# necessary libraries
library(readr)
## Warning: package 'readr' was built under R version 4.3.2
library(plyr)
## Warning: package 'plyr' was built under R version 4.3.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(tidyr)
library(stringr)
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.3.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Use a fixed CRAN mirror so package installation works non-interactively
options(repos = c(CRAN = "https://cran.rstudio.com/"))
# Install fastmap only when it is missing. Reinstalling on every run is slow
# and can fail to replace a package whose files are currently in use.
if (!requireNamespace("fastmap", quietly = TRUE)) {
  install.packages("fastmap")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'fastmap' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'fastmap'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\mujta\AppData\Local\R\win-library\4.3\00LOCK\fastmap\libs\x64\fastmap.dll
## to C:\Users\mujta\AppData\Local\R\win-library\4.3\fastmap\libs\x64\fastmap.dll:
## Permission denied
## Warning: restored 'fastmap'
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
# Install skimr only when it is not already available, instead of
# re-downloading it on every run
if (!requireNamespace("skimr", quietly = TRUE)) {
  install.packages("skimr")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'skimr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
library(skimr)
## Warning: package 'skimr' was built under R version 4.3.2
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 4.3.2
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
# Read the hotel customer data from CSV into a tibble.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists.
HotelLisbon_data <- read_csv(
  file = "C:/Users/mujta/OneDrive/Desktop/Data mining cvs.csv"
)
## New names:
## • `` -> `...1`
## Rows: 75000 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Nationality, Age, NameHash, DocIDHash, DistributionChannel, Market...
## dbl (26): ...1, ID, DaysSinceCreation, AverageLeadTime, LodgingRevenue, Othe...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in interactive sessions: View()
# needs a GUI and serves no purpose in a batch or knitted run.
if (interactive()) {
  View(HotelLisbon_data)
}
# Report the number of rows and columns
dim(HotelLisbon_data)
## [1] 75000 32
# Start exploring the raw data: list every column name
print(names(HotelLisbon_data))
## [1] "...1" "ID" "Nationality"
## [4] "Age" "DaysSinceCreation" "NameHash"
## [7] "DocIDHash" "AverageLeadTime" "LodgingRevenue"
## [10] "OtherRevenue" "BookingsCanceled" "BookingsNoShowed"
## [13] "BookingsCheckedIn" "PersonsNights" "RoomNights"
## [16] "DaysSinceLastStay" "DaysSinceFirstStay" "DistributionChannel"
## [19] "MarketSegment" "SRHighFloor" "SRLowFloor"
## [22] "SRAccessibleRoom" "SRMediumFloor" "SRBathtub"
## [25] "SRShower" "SRCrib" "SRKingSizeBed"
## [28] "SRTwinBed" "SRNearElevator" "SRAwayFromElevator"
## [31] "SRNoAlcoholInMiniBar" "SRQuietRoom"
# Print skimr's overview of the data set: per-column missing counts,
# completeness and distribution statistics, grouped by column type
skim(HotelLisbon_data)
| Name | HotelLisbon_data |
| Number of rows | 75000 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| character | 6 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Nationality | 0 | 1 | 3 | 3 | 0 | 185 | 0 |
| Age | 0 | 1 | 1 | 4 | 0 | 105 | 0 |
| NameHash | 0 | 1 | 66 | 66 | 0 | 72600 | 0 |
| DocIDHash | 0 | 1 | 66 | 66 | 0 | 69343 | 0 |
| DistributionChannel | 0 | 1 | 6 | 23 | 0 | 4 | 0 |
| MarketSegment | 0 | 1 | 5 | 21 | 0 | 7 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| …1 | 0 | 1 | 37500.50 | 21650.78 | 1 | 18750.75 | 37500.5 | 56250.25 | 75000.0 | ▇▇▇▇▇ |
| ID | 0 | 1 | 41823.04 | 24144.59 | 1 | 20884.75 | 41824.5 | 62715.25 | 83590.0 | ▇▇▇▇▇ |
| DaysSinceCreation | 0 | 1 | 453.36 | 313.53 | 0 | 177.00 | 396.0 | 723.00 | 1095.0 | ▇▆▅▃▃ |
| AverageLeadTime | 0 | 1 | 66.21 | 87.82 | -1 | 0.00 | 29.0 | 103.00 | 588.0 | ▇▂▁▁▁ |
| LodgingRevenue | 0 | 1 | 298.99 | 374.00 | 0 | 59.00 | 234.0 | 403.20 | 21781.0 | ▇▁▁▁▁ |
| OtherRevenue | 0 | 1 | 67.49 | 110.63 | 0 | 2.00 | 38.5 | 88.00 | 5105.5 | ▇▁▁▁▁ |
| BookingsCanceled | 0 | 1 | 0.00 | 0.07 | 0 | 0.00 | 0.0 | 0.00 | 9.0 | ▇▁▁▁▁ |
| BookingsNoShowed | 0 | 1 | 0.00 | 0.03 | 0 | 0.00 | 0.0 | 0.00 | 3.0 | ▇▁▁▁▁ |
| BookingsCheckedIn | 0 | 1 | 0.79 | 0.65 | 0 | 1.00 | 1.0 | 1.00 | 57.0 | ▇▁▁▁▁ |
| PersonsNights | 0 | 1 | 4.65 | 4.56 | 0 | 1.00 | 4.0 | 6.00 | 116.0 | ▇▁▁▁▁ |
| RoomNights | 0 | 1 | 2.36 | 2.28 | 0 | 1.00 | 2.0 | 4.00 | 185.0 | ▇▁▁▁▁ |
| DaysSinceLastStay | 0 | 1 | 400.89 | 347.27 | -1 | 26.00 | 366.0 | 694.00 | 1104.0 | ▇▃▃▃▂ |
| DaysSinceFirstStay | 0 | 1 | 403.13 | 348.08 | -1 | 27.00 | 369.0 | 698.00 | 1186.0 | ▇▃▃▃▂ |
| SRHighFloor | 0 | 1 | 0.05 | 0.21 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRLowFloor | 0 | 1 | 0.00 | 0.04 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRAccessibleRoom | 0 | 1 | 0.00 | 0.02 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRMediumFloor | 0 | 1 | 0.00 | 0.03 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRBathtub | 0 | 1 | 0.00 | 0.05 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRShower | 0 | 1 | 0.00 | 0.04 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRCrib | 0 | 1 | 0.01 | 0.11 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRKingSizeBed | 0 | 1 | 0.35 | 0.48 | 0 | 0.00 | 0.0 | 1.00 | 1.0 | ▇▁▁▁▅ |
| SRTwinBed | 0 | 1 | 0.14 | 0.35 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▂ |
| SRNearElevator | 0 | 1 | 0.00 | 0.02 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRAwayFromElevator | 0 | 1 | 0.00 | 0.06 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRNoAlcoholInMiniBar | 0 | 1 | 0.00 | 0.01 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
| SRQuietRoom | 0 | 1 | 0.09 | 0.28 | 0 | 0.00 | 0.0 | 0.00 | 1.0 | ▇▁▁▁▁ |
# Show the internal structure: each column's type and its first few values
str(HotelLisbon_data)
## spc_tbl_ [75,000 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:75000] 1 2 3 4 5 6 7 8 9 10 ...
## $ ID : num [1:75000] 20351 62663 30398 39784 17929 ...
## $ Nationality : chr [1:75000] "BRA" "CAN" "PHL" "FRA" ...
## $ Age : chr [1:75000] "85" "30" "70" "31" ...
## $ DaysSinceCreation : num [1:75000] 733 178 564 430 785 314 794 237 817 750 ...
## $ NameHash : chr [1:75000] "0x0BF2ECC3BF14F7FF3F926275E9BAAFAFF5823E69F81319DCCC5867DC986E10DC" "0xE4899D5F1CF2354CE1EBCD1717CE2EC2D91DE694C9118ADA37CB726A3F43DE22" "0xA1D0401C1635B389B99596B21D6C463B6E63444457B010DDD3D31AC1CE19C2ED" "0x0C948619213E11A1EB2E326CA64277BAFADDDDE51D7D2EB9DA7B71295C8704BC" ...
## $ DocIDHash : chr [1:75000] "0x44749B4F7510099B0A4BEF85DE72E75ABD3CC90896949AAC4EF1A46598DCE490" "0x78C451F6556F7129351AE28B3BA7DD499E258DDBEC31F89302AE62899301DB4A" "0x613A9E9859B7CA68E8D3613BE4B2880059B2A5134E2B2B8C85186EAC073A3AC8" "0x56D89BC906AA74D89F63D75436A2BBC0B2DE9EAD2D49CEA970713BD02290AE54" ...
## $ AverageLeadTime : num [1:75000] 41 119 94 47 148 0 33 230 213 157 ...
## $ LodgingRevenue : num [1:75000] 53 1041 1512 219 269 ...
## $ OtherRevenue : num [1:75000] 14 162 72 146 58.5 ...
## $ BookingsCanceled : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ BookingsNoShowed : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ BookingsCheckedIn : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
## $ PersonsNights : num [1:75000] 2 6 8 6 8 0 2 5 8 6 ...
## $ RoomNights : num [1:75000] 1 3 4 3 4 0 2 5 4 3 ...
## $ DaysSinceLastStay : num [1:75000] 734 181 568 433 789 -1 796 242 821 753 ...
## $ DaysSinceFirstStay : num [1:75000] 734 181 568 433 789 -1 796 242 821 753 ...
## $ DistributionChannel : chr [1:75000] "Travel Agent/Operator" "Travel Agent/Operator" "Travel Agent/Operator" "Travel Agent/Operator" ...
## $ MarketSegment : chr [1:75000] "Travel Agent/Operator" "Other" "Other" "Travel Agent/Operator" ...
## $ SRHighFloor : num [1:75000] 0 1 0 0 0 0 0 0 0 0 ...
## $ SRLowFloor : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ SRAccessibleRoom : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ SRMediumFloor : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ SRBathtub : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ SRShower : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ SRCrib : num [1:75000] 0 0 0 1 0 0 0 0 0 0 ...
## $ SRKingSizeBed : num [1:75000] 0 0 0 0 0 1 0 0 1 0 ...
## $ SRTwinBed : num [1:75000] 1 0 0 0 0 0 0 0 0 0 ...
## $ SRNearElevator : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ SRAwayFromElevator : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ SRNoAlcoholInMiniBar: num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ SRQuietRoom : num [1:75000] 0 0 0 0 0 0 0 0 1 0 ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. ID = col_double(),
## .. Nationality = col_character(),
## .. Age = col_character(),
## .. DaysSinceCreation = col_double(),
## .. NameHash = col_character(),
## .. DocIDHash = col_character(),
## .. AverageLeadTime = col_double(),
## .. LodgingRevenue = col_double(),
## .. OtherRevenue = col_double(),
## .. BookingsCanceled = col_double(),
## .. BookingsNoShowed = col_double(),
## .. BookingsCheckedIn = col_double(),
## .. PersonsNights = col_double(),
## .. RoomNights = col_double(),
## .. DaysSinceLastStay = col_double(),
## .. DaysSinceFirstStay = col_double(),
## .. DistributionChannel = col_character(),
## .. MarketSegment = col_character(),
## .. SRHighFloor = col_double(),
## .. SRLowFloor = col_double(),
## .. SRAccessibleRoom = col_double(),
## .. SRMediumFloor = col_double(),
## .. SRBathtub = col_double(),
## .. SRShower = col_double(),
## .. SRCrib = col_double(),
## .. SRKingSizeBed = col_double(),
## .. SRTwinBed = col_double(),
## .. SRNearElevator = col_double(),
## .. SRAwayFromElevator = col_double(),
## .. SRNoAlcoholInMiniBar = col_double(),
## .. SRQuietRoom = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# HotelLisbon_data is the data frame under analysis.
# Install knitr only when it is not already available, instead of
# re-downloading it on every run (it is loaded right after this step).
if (!requireNamespace("knitr", quietly = TRUE)) {
  install.packages("knitr")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'knitr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
library(knitr)
## Warning: package 'knitr' was built under R version 4.3.2
# Build an overview with one row per variable: its name and primary class
variable_summary <- data.frame(
  Variable = names(HotelLisbon_data),
  # vapply() guarantees exactly one string per column; class(x)[1] keeps a
  # column with several classes from turning the result into a list/matrix
  Type = vapply(HotelLisbon_data, function(x) class(x)[1], character(1))
)
# Render the overview as a simple table. The row names merely repeat the
# Variable column, so they are left out of the rendered table.
kable(variable_summary, row.names = FALSE, caption = "Variable Summary")
| Variable | Type | |
|---|---|---|
| …1 | …1 | numeric |
| ID | ID | numeric |
| Nationality | Nationality | character |
| Age | Age | character |
| DaysSinceCreation | DaysSinceCreation | numeric |
| NameHash | NameHash | character |
| DocIDHash | DocIDHash | character |
| AverageLeadTime | AverageLeadTime | numeric |
| LodgingRevenue | LodgingRevenue | numeric |
| OtherRevenue | OtherRevenue | numeric |
| BookingsCanceled | BookingsCanceled | numeric |
| BookingsNoShowed | BookingsNoShowed | numeric |
| BookingsCheckedIn | BookingsCheckedIn | numeric |
| PersonsNights | PersonsNights | numeric |
| RoomNights | RoomNights | numeric |
| DaysSinceLastStay | DaysSinceLastStay | numeric |
| DaysSinceFirstStay | DaysSinceFirstStay | numeric |
| DistributionChannel | DistributionChannel | character |
| MarketSegment | MarketSegment | character |
| SRHighFloor | SRHighFloor | numeric |
| SRLowFloor | SRLowFloor | numeric |
| SRAccessibleRoom | SRAccessibleRoom | numeric |
| SRMediumFloor | SRMediumFloor | numeric |
| SRBathtub | SRBathtub | numeric |
| SRShower | SRShower | numeric |
| SRCrib | SRCrib | numeric |
| SRKingSizeBed | SRKingSizeBed | numeric |
| SRTwinBed | SRTwinBed | numeric |
| SRNearElevator | SRNearElevator | numeric |
| SRAwayFromElevator | SRAwayFromElevator | numeric |
| SRNoAlcoholInMiniBar | SRNoAlcoholInMiniBar | numeric |
| SRQuietRoom | SRQuietRoom | numeric |
# Review the summary statistics of every column for anything unusual
summary(HotelLisbon_data)
## ...1 ID Nationality Age
## Min. : 1 Min. : 1 Length:75000 Length:75000
## 1st Qu.:18751 1st Qu.:20885 Class :character Class :character
## Median :37501 Median :41825 Mode :character Mode :character
## Mean :37501 Mean :41823
## 3rd Qu.:56250 3rd Qu.:62715
## Max. :75000 Max. :83590
## DaysSinceCreation NameHash DocIDHash AverageLeadTime
## Min. : 0.0 Length:75000 Length:75000 Min. : -1.00
## 1st Qu.: 177.0 Class :character Class :character 1st Qu.: 0.00
## Median : 396.0 Mode :character Mode :character Median : 29.00
## Mean : 453.4 Mean : 66.21
## 3rd Qu.: 723.0 3rd Qu.:103.00
## Max. :1095.0 Max. :588.00
## LodgingRevenue OtherRevenue BookingsCanceled BookingsNoShowed
## Min. : 0.0 Min. : 0.00 Min. :0.000000 Min. :0.0000000
## 1st Qu.: 59.0 1st Qu.: 2.00 1st Qu.:0.000000 1st Qu.:0.0000000
## Median : 234.0 Median : 38.50 Median :0.000000 Median :0.0000000
## Mean : 299.0 Mean : 67.49 Mean :0.001987 Mean :0.0005867
## 3rd Qu.: 403.2 3rd Qu.: 88.00 3rd Qu.:0.000000 3rd Qu.:0.0000000
## Max. :21781.0 Max. :5105.50 Max. :9.000000 Max. :3.0000000
## BookingsCheckedIn PersonsNights RoomNights DaysSinceLastStay
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : -1.0
## 1st Qu.: 1.0000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 26.0
## Median : 1.0000 Median : 4.000 Median : 2.000 Median : 366.0
## Mean : 0.7934 Mean : 4.647 Mean : 2.358 Mean : 400.9
## 3rd Qu.: 1.0000 3rd Qu.: 6.000 3rd Qu.: 4.000 3rd Qu.: 694.0
## Max. :57.0000 Max. :116.000 Max. :185.000 Max. :1104.0
## DaysSinceFirstStay DistributionChannel MarketSegment SRHighFloor
## Min. : -1.0 Length:75000 Length:75000 Min. :0.00000
## 1st Qu.: 27.0 Class :character Class :character 1st Qu.:0.00000
## Median : 369.0 Mode :character Mode :character Median :0.00000
## Mean : 403.1 Mean :0.04753
## 3rd Qu.: 698.0 3rd Qu.:0.00000
## Max. :1186.0 Max. :1.00000
## SRLowFloor SRAccessibleRoom SRMediumFloor SRBathtub
## Min. :0.000000 Min. :0.00000 Min. :0.0000000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000000 1st Qu.:0.00000
## Median :0.000000 Median :0.00000 Median :0.0000000 Median :0.00000
## Mean :0.001373 Mean :0.00028 Mean :0.0008933 Mean :0.00288
## 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.0000000 3rd Qu.:0.00000
## Max. :1.000000 Max. :1.00000 Max. :1.0000000 Max. :1.00000
## SRShower SRCrib SRKingSizeBed SRTwinBed
## Min. :0.000000 Min. :0.00000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.000000 Median :0.00000 Median :0.0000 Median :0.000
## Mean :0.001787 Mean :0.01325 Mean :0.3522 Mean :0.143
## 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :1.000000 Max. :1.00000 Max. :1.0000 Max. :1.000
## SRNearElevator SRAwayFromElevator SRNoAlcoholInMiniBar SRQuietRoom
## Min. :0.00000 Min. :0.000000 Min. :0.0000000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.0000000 1st Qu.:0.00000
## Median :0.00000 Median :0.000000 Median :0.0000000 Median :0.00000
## Mean :0.00036 Mean :0.003613 Mean :0.0001333 Mean :0.08803
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.0000000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.000000 Max. :1.0000000 Max. :1.00000
# knitr supplies kable()
library(knitr)
# Compute the per-column summary statistics once and keep them for reuse
summary_table <- HotelLisbon_data %>% summary()
# Render the statistics as a markdown table
kable(x = summary_table, caption = "Summary Statistics", format = "markdown")
| …1 | ID | Nationality | Age | DaysSinceCreation | NameHash | DocIDHash | AverageLeadTime | LodgingRevenue | OtherRevenue | BookingsCanceled | BookingsNoShowed | BookingsCheckedIn | PersonsNights | RoomNights | DaysSinceLastStay | DaysSinceFirstStay | DistributionChannel | MarketSegment | SRHighFloor | SRLowFloor | SRAccessibleRoom | SRMediumFloor | SRBathtub | SRShower | SRCrib | SRKingSizeBed | SRTwinBed | SRNearElevator | SRAwayFromElevator | SRNoAlcoholInMiniBar | SRQuietRoom | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min. : 1 | Min. : 1 | Length:75000 | Length:75000 | Min. : 0.0 | Length:75000 | Length:75000 | Min. : -1.00 | Min. : 0.0 | Min. : 0.00 | Min. :0.000000 | Min. :0.0000000 | Min. : 0.0000 | Min. : 0.000 | Min. : 0.000 | Min. : -1.0 | Min. : -1.0 | Length:75000 | Length:75000 | Min. :0.00000 | Min. :0.000000 | Min. :0.00000 | Min. :0.0000000 | Min. :0.00000 | Min. :0.000000 | Min. :0.00000 | Min. :0.0000 | Min. :0.000 | Min. :0.00000 | Min. :0.000000 | Min. :0.0000000 | Min. :0.00000 | |
| 1st Qu.:18751 | 1st Qu.:20885 | Class :character | Class :character | 1st Qu.: 177.0 | Class :character | Class :character | 1st Qu.: 0.00 | 1st Qu.: 59.0 | 1st Qu.: 2.00 | 1st Qu.:0.000000 | 1st Qu.:0.0000000 | 1st Qu.: 1.0000 | 1st Qu.: 1.000 | 1st Qu.: 1.000 | 1st Qu.: 26.0 | 1st Qu.: 27.0 | Class :character | Class :character | 1st Qu.:0.00000 | 1st Qu.:0.000000 | 1st Qu.:0.00000 | 1st Qu.:0.0000000 | 1st Qu.:0.00000 | 1st Qu.:0.000000 | 1st Qu.:0.00000 | 1st Qu.:0.0000 | 1st Qu.:0.000 | 1st Qu.:0.00000 | 1st Qu.:0.000000 | 1st Qu.:0.0000000 | 1st Qu.:0.00000 | |
| Median :37501 | Median :41825 | Mode :character | Mode :character | Median : 396.0 | Mode :character | Mode :character | Median : 29.00 | Median : 234.0 | Median : 38.50 | Median :0.000000 | Median :0.0000000 | Median : 1.0000 | Median : 4.000 | Median : 2.000 | Median : 366.0 | Median : 369.0 | Mode :character | Mode :character | Median :0.00000 | Median :0.000000 | Median :0.00000 | Median :0.0000000 | Median :0.00000 | Median :0.000000 | Median :0.00000 | Median :0.0000 | Median :0.000 | Median :0.00000 | Median :0.000000 | Median :0.0000000 | Median :0.00000 | |
| Mean :37501 | Mean :41823 | NA | NA | Mean : 453.4 | NA | NA | Mean : 66.21 | Mean : 299.0 | Mean : 67.49 | Mean :0.001987 | Mean :0.0005867 | Mean : 0.7934 | Mean : 4.647 | Mean : 2.358 | Mean : 400.9 | Mean : 403.1 | NA | NA | Mean :0.04753 | Mean :0.001373 | Mean :0.00028 | Mean :0.0008933 | Mean :0.00288 | Mean :0.001787 | Mean :0.01325 | Mean :0.3522 | Mean :0.143 | Mean :0.00036 | Mean :0.003613 | Mean :0.0001333 | Mean :0.08803 | |
| 3rd Qu.:56250 | 3rd Qu.:62715 | NA | NA | 3rd Qu.: 723.0 | NA | NA | 3rd Qu.:103.00 | 3rd Qu.: 403.2 | 3rd Qu.: 88.00 | 3rd Qu.:0.000000 | 3rd Qu.:0.0000000 | 3rd Qu.: 1.0000 | 3rd Qu.: 6.000 | 3rd Qu.: 4.000 | 3rd Qu.: 694.0 | 3rd Qu.: 698.0 | NA | NA | 3rd Qu.:0.00000 | 3rd Qu.:0.000000 | 3rd Qu.:0.00000 | 3rd Qu.:0.0000000 | 3rd Qu.:0.00000 | 3rd Qu.:0.000000 | 3rd Qu.:0.00000 | 3rd Qu.:1.0000 | 3rd Qu.:0.000 | 3rd Qu.:0.00000 | 3rd Qu.:0.000000 | 3rd Qu.:0.0000000 | 3rd Qu.:0.00000 | |
| Max. :75000 | Max. :83590 | NA | NA | Max. :1095.0 | NA | NA | Max. :588.00 | Max. :21781.0 | Max. :5105.50 | Max. :9.000000 | Max. :3.0000000 | Max. :57.0000 | Max. :116.000 | Max. :185.000 | Max. :1104.0 | Max. :1186.0 | NA | NA | Max. :1.00000 | Max. :1.000000 | Max. :1.00000 | Max. :1.0000000 | Max. :1.00000 | Max. :1.000000 | Max. :1.00000 | Max. :1.0000 | Max. :1.000 | Max. :1.00000 | Max. :1.000000 | Max. :1.0000000 | Max. :1.00000 |
# Let's see if we have any missing values.
# Count the NA entries in every column
missing_values <- HotelLisbon_data %>%
  is.na() %>%
  colSums()
print(missing_values)
## ...1 ID Nationality
## 0 0 0
## Age DaysSinceCreation NameHash
## 0 0 0
## DocIDHash AverageLeadTime LodgingRevenue
## 0 0 0
## OtherRevenue BookingsCanceled BookingsNoShowed
## 0 0 0
## BookingsCheckedIn PersonsNights RoomNights
## 0 0 0
## DaysSinceLastStay DaysSinceFirstStay DistributionChannel
## 0 0 0
## MarketSegment SRHighFloor SRLowFloor
## 0 0 0
## SRAccessibleRoom SRMediumFloor SRBathtub
## 0 0 0
## SRShower SRCrib SRKingSizeBed
## 0 0 0
## SRTwinBed SRNearElevator SRAwayFromElevator
## 0 0 0
## SRNoAlcoholInMiniBar SRQuietRoom
## 0 0
# It seems there are no missing (NA) values.
# However, the summary statistics show that "Age" is stored as character, which should not be the case; some entries probably contain non-numeric text.
# Indeed, after checking the data set, there are "NULL" values in Age for customers with Nationality PRT. We need to look at the whole data set.
# Let's investigate all the NULL values, if there are any.
# Count R's missing values (NA) in the Age column and in the whole data set.
# These counts do not detect the literal text "NULL".
null_values_age <- sum(is.na(HotelLisbon_data[["Age"]]))
null_values <- sum(is.na(HotelLisbon_data))
print(null_values)
## [1] 0
null_values_age
## [1] 0
# The NA checks found nothing, so look for the literal text "NULL" in Age
null_age_rows <- HotelLisbon_data[["Age"]] == "NULL"
# Tally those rows by customer nationality
null_age_by_country <- table(HotelLisbon_data[["Nationality"]][null_age_rows])
# Show the tally
print(null_age_by_country)
##
## ESP PAN PRT
## 1 1 3368
# Only Age has "NULL" values; in the data set it is written in upper case.
# List the identifying columns of the customers whose Age is "NULL".
customers_with_null_age <- select(
  filter(HotelLisbon_data, Age == "NULL"),
  ID, Nationality, NameHash, DocIDHash
)
print(customers_with_null_age)
## # A tibble: 3,370 × 4
## ID Nationality NameHash DocIDHash
## <dbl> <chr> <chr> <chr>
## 1 45142 PRT 0x76DC0F0F62F831421F112A43C0E872118157C520FBE246… 0x5FA1E0…
## 2 35353 PRT 0x03E34CF88B9A651D29C435163112746CB79CD68D7B3FC0… 0x5FA1E0…
## 3 28863 PRT 0xCF7C76165178E1DB83FA1ED2B2BECCA075D8AAC8104799… 0x5FA1E0…
## 4 36797 PRT 0xDC80CCCAFE75A6647DA264B3AB63DF7BBE00B810E99350… 0x5FA1E0…
## 5 10793 PRT 0x01FBEA469D671F06FF4954F108BE7BC6D56A222F429815… 0x5FA1E0…
## 6 6346 PRT 0xDA7528770C8A14A3026F73F0B61C983F4EE721B7992BEE… 0x5FA1E0…
## 7 3245 PRT 0xE8E01F1987C2E0ADB97CEA6FA23FAD54308C4FC3D518FE… 0x5FA1E0…
## 8 2199 PRT 0x12DBDE199AE9016BC597665C6429B4980CAE2E6B51F53D… 0x5FA1E0…
## 9 70109 PRT 0x09D3589CED237C1402EAB5B7ADEA72775360592D6F1B78… 0x5FA1E0…
## 10 64592 PRT 0x51546E434D6D257A120E19AA3310943085A22DE2248F58… 0x5FA1E0…
## # ℹ 3,360 more rows
# The "NULL" ages belong mostly to Nationality PRT, plus one customer each
# from ESP and PAN. Age needs to become a numerical/quantitative variable.
# Let's check for negative, NA and "NULL" values in the variables.
# Number of negative observations in each variable. Non-numeric columns are
# parsed as numbers first: comparing text with 0 would otherwise fall back to
# locale-dependent string ordering. Text that is not a number (e.g. "NULL")
# becomes NA and is ignored.
negative_values <- vapply(
  HotelLisbon_data,
  function(x) {
    if (!is.numeric(x)) {
      # Coercion warnings are expected here and carry no information
      x <- suppressWarnings(as.numeric(x))
    }
    sum(x < 0, na.rm = TRUE)
  },
  integer(1)
)
print(negative_values)
## ...1 ID Nationality
## 0 0 0
## Age DaysSinceCreation NameHash
## 15 0 0
## DocIDHash AverageLeadTime LodgingRevenue
## 0 10 0
## OtherRevenue BookingsCanceled BookingsNoShowed
## 0 0 0
## BookingsCheckedIn PersonsNights RoomNights
## 0 0 0
## DaysSinceLastStay DaysSinceFirstStay DistributionChannel
## 17886 17886 0
## MarketSegment SRHighFloor SRLowFloor
## 0 0 0
## SRAccessibleRoom SRMediumFloor SRBathtub
## 0 0 0
## SRShower SRCrib SRKingSizeBed
## 0 0 0
## SRTwinBed SRNearElevator SRAwayFromElevator
## 0 0 0
## SRNoAlcoholInMiniBar SRQuietRoom
## 0 0
# Number of NA observations in each variable.
# vapply() pins the result to one integer per column, whereas sapply() can
# silently change its return type.
na_values <- vapply(
  HotelLisbon_data,
  function(column) sum(is.na(column)),
  integer(1)
)
print(na_values)
## ...1 ID Nationality
## 0 0 0
## Age DaysSinceCreation NameHash
## 0 0 0
## DocIDHash AverageLeadTime LodgingRevenue
## 0 0 0
## OtherRevenue BookingsCanceled BookingsNoShowed
## 0 0 0
## BookingsCheckedIn PersonsNights RoomNights
## 0 0 0
## DaysSinceLastStay DaysSinceFirstStay DistributionChannel
## 0 0 0
## MarketSegment SRHighFloor SRLowFloor
## 0 0 0
## SRAccessibleRoom SRMediumFloor SRBathtub
## 0 0 0
## SRShower SRCrib SRKingSizeBed
## 0 0 0
## SRTwinBed SRNearElevator SRAwayFromElevator
## 0 0 0
## SRNoAlcoholInMiniBar SRQuietRoom
## 0 0
# Number of observations equal to the literal text "NULL" in each variable.
# na.rm = TRUE keeps a single NA from turning a column's count into NA;
# vapply() pins the result to one integer per column.
null_values_char <- vapply(
  HotelLisbon_data,
  function(column) sum(column == "NULL", na.rm = TRUE),
  integer(1)
)
print(null_values_char)
## ...1 ID Nationality
## 0 0 0
## Age DaysSinceCreation NameHash
## 3370 0 0
## DocIDHash AverageLeadTime LodgingRevenue
## 0 0 0
## OtherRevenue BookingsCanceled BookingsNoShowed
## 0 0 0
## BookingsCheckedIn PersonsNights RoomNights
## 0 0 0
## DaysSinceLastStay DaysSinceFirstStay DistributionChannel
## 0 0 0
## MarketSegment SRHighFloor SRLowFloor
## 0 0 0
## SRAccessibleRoom SRMediumFloor SRBathtub
## 0 0 0
## SRShower SRCrib SRKingSizeBed
## 0 0 0
## SRTwinBed SRNearElevator SRAwayFromElevator
## 0 0 0
## SRNoAlcoholInMiniBar SRQuietRoom
## 0 0
# Number of NA observations in each variable, for the case where a missing
# ("NULL") entry was read in as NA rather than as text.
# vapply() pins the result to one integer per column.
null_values_na <- vapply(
  HotelLisbon_data,
  function(column) sum(is.na(column)),
  integer(1)
)
print(null_values_na)
## ...1 ID Nationality
## 0 0 0
## Age DaysSinceCreation NameHash
## 0 0 0
## DocIDHash AverageLeadTime LodgingRevenue
## 0 0 0
## OtherRevenue BookingsCanceled BookingsNoShowed
## 0 0 0
## BookingsCheckedIn PersonsNights RoomNights
## 0 0 0
## DaysSinceLastStay DaysSinceFirstStay DistributionChannel
## 0 0 0
## MarketSegment SRHighFloor SRLowFloor
## 0 0 0
## SRAccessibleRoom SRMediumFloor SRBathtub
## 0 0 0
## SRShower SRCrib SRKingSizeBed
## 0 0 0
## SRTwinBed SRNearElevator SRAwayFromElevator
## 0 0 0
## SRNoAlcoholInMiniBar SRQuietRoom
## 0 0
# Specify which variables have negative, NA or "NULL" values, and how many
# observations are affected.
# Create a data frame to store the counts (one row per variable).
observations_summary <- data.frame(
  Variable = names(HotelLisbon_data),
  # Non-numeric columns are parsed as numbers before the comparison, so the
  # count does not depend on locale-specific string ordering; text that is
  # not a number (e.g. "NULL") becomes NA and is ignored
  Negative = vapply(
    HotelLisbon_data,
    function(x) {
      if (!is.numeric(x)) {
        x <- suppressWarnings(as.numeric(x))
      }
      sum(x < 0, na.rm = TRUE)
    },
    integer(1)
  ),
  NA_Count = vapply(HotelLisbon_data, function(x) sum(is.na(x)), integer(1)),
  # na.rm = TRUE keeps a single NA from turning a column's count into NA
  NULL_Count = vapply(
    HotelLisbon_data,
    function(x) sum(x == "NULL", na.rm = TRUE),
    integer(1)
  )
)
# Print the summary
print(observations_summary)
## Variable Negative NA_Count NULL_Count
## ...1 ...1 0 0 0
## ID ID 0 0 0
## Nationality Nationality 0 0 0
## Age Age 15 0 3370
## DaysSinceCreation DaysSinceCreation 0 0 0
## NameHash NameHash 0 0 0
## DocIDHash DocIDHash 0 0 0
## AverageLeadTime AverageLeadTime 10 0 0
## LodgingRevenue LodgingRevenue 0 0 0
## OtherRevenue OtherRevenue 0 0 0
## BookingsCanceled BookingsCanceled 0 0 0
## BookingsNoShowed BookingsNoShowed 0 0 0
## BookingsCheckedIn BookingsCheckedIn 0 0 0
## PersonsNights PersonsNights 0 0 0
## RoomNights RoomNights 0 0 0
## DaysSinceLastStay DaysSinceLastStay 17886 0 0
## DaysSinceFirstStay DaysSinceFirstStay 17886 0 0
## DistributionChannel DistributionChannel 0 0 0
## MarketSegment MarketSegment 0 0 0
## SRHighFloor SRHighFloor 0 0 0
## SRLowFloor SRLowFloor 0 0 0
## SRAccessibleRoom SRAccessibleRoom 0 0 0
## SRMediumFloor SRMediumFloor 0 0 0
## SRBathtub SRBathtub 0 0 0
## SRShower SRShower 0 0 0
## SRCrib SRCrib 0 0 0
## SRKingSizeBed SRKingSizeBed 0 0 0
## SRTwinBed SRTwinBed 0 0 0
## SRNearElevator SRNearElevator 0 0 0
## SRAwayFromElevator SRAwayFromElevator 0 0 0
## SRNoAlcoholInMiniBar SRNoAlcoholInMiniBar 0 0 0
## SRQuietRoom SRQuietRoom 0 0 0
# Variables with negative values:
#   Age: 15; AverageLeadTime: 10; DaysSinceLastStay: 17,886; DaysSinceFirstStay: 17,886
# Variables with NA values: none.
# Variables with "NULL" values: Age (3,370).
# Next, let's investigate the unique values in our data set.
# Return, for every column of `data`, at most `limit` of its distinct values,
# in order of first appearance.
#
# data:  a data frame (or any list of vectors).
# limit: maximum number of distinct values kept per column (default 20).
# Returns a list with one vector per column, named like the columns.
#
# head() replaces indexing with 1:min(n, limit): for an empty column that
# range evaluated to c(1, 0), so the result was a single NA instead of an
# empty vector. It also avoids computing unique() twice per column.
get_limited_unique_values <- function(data, limit = 20) {
  lapply(data, function(x) head(unique(x), limit))
}
# Cap on how many distinct values are shown per variable
display_limit <- 20
# Collect the capped distinct values of every column in HotelLisbon_data
limited_unique_values_list <- get_limited_unique_values(
  HotelLisbon_data,
  display_limit
)
# Emit each variable's name, then its values, then a blank line
invisible(Map(
  function(variable_name, variable_values) {
    cat("Variable:", variable_name, "\n")
    print(variable_values)
    cat("\n")
  },
  names(limited_unique_values_list),
  limited_unique_values_list
))
## Variable: ...1
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
##
## Variable: ID
## [1] 20351 62663 30398 39784 17929 46541 17258 55277 15704 19803 53746 4649
## [13] 10264 2629 23494 35607 40960 18689 36582 42337
##
## Variable: Nationality
## [1] "BRA" "CAN" "PHL" "FRA" "HUN" "ITA" "EST" "BEL" "PRT" "GBR" "USA" "COL"
## [13] "CHN" "IRL" "DEU" "CHE" "AUS" "ESP" "BIH" "ISR"
##
## Variable: Age
## [1] "85" "30" "70" "31" "29" "14" "49" "55" "53" "47" "44" "65" "62" "38" "20"
## [16] "41" "19" "69" "28" "40"
##
## Variable: DaysSinceCreation
## [1] 733 178 564 430 785 314 794 237 817 750 249 990 897 1027 667
## [16] 486 411 771 472 386
##
## Variable: NameHash
## [1] "0x0BF2ECC3BF14F7FF3F926275E9BAAFAFF5823E69F81319DCCC5867DC986E10DC"
## [2] "0xE4899D5F1CF2354CE1EBCD1717CE2EC2D91DE694C9118ADA37CB726A3F43DE22"
## [3] "0xA1D0401C1635B389B99596B21D6C463B6E63444457B010DDD3D31AC1CE19C2ED"
## [4] "0x0C948619213E11A1EB2E326CA64277BAFADDDDE51D7D2EB9DA7B71295C8704BC"
## [5] "0xCB5BBB03BC55C10987902657B83708EFF49192751D43942E784EA48C92E0F6B0"
## [6] "0x5199CD2B356E8BD8D84D9CDDFEBF69771B04D65D7575C5AAAB5055ACA261EF90"
## [7] "0xC4CDDFC6B120BBAAC63F4FFD1C39B5BFBC03C0992973321EA972AA8A714C4E36"
## [8] "0x5D1CB184D92EE7F095CA22D47D1D3F867A77C0841F8C94CA2C1F4944803E47A1"
## [9] "0x9F4162C841944D33574F243D2256001F1018A4F3DA5D8DF115DB2CB78BD30EEB"
## [10] "0x28E9485A3E06E2C0039D225160A87EF025D36489847555BAD0D610746B28F05D"
## [11] "0x3614128425809C4CD1438137D37BE65D0B422084B230DCFD5D4544668F379825"
## [12] "0xF7B61527AD41493413A8138C38F391BEE177F12CCF52D2AEED5652A8BD92C132"
## [13] "0xEC72C7BAE4558A9270FF8FDBCEC759B343C6AEF79F37B2DA464B759240124B2E"
## [14] "0x59CB21FECE9AD0AEDF436B5F745AFD4DFEF3F8EB010593FA89F36BAE87AD4A15"
## [15] "0x7A56EB69D6A8EB1CBDD32AF9285DE543353EE1258CE09E7CA3C9EA761B96A446"
## [16] "0x0CB7D63FEB45A9AC3641E5974598C3DBF713B42382C3E036F119972E4CECCAE9"
## [17] "0x1E3A0E71BAC664E904D7C2F145DF260C5973D26DD460A290C2D33CE02AD2CF98"
## [18] "0x6F820AF2D725F3CDA8C315494B6F01B69EF1AF35FD337185DE1A37065B28B13E"
## [19] "0xCE236402009BD6A678F698D67EC43B1AA38C1BC4207DCEA84D8128745432FD04"
## [20] "0xA002C5DE86431272037EEA01E58CD2D09359CB905C2578C64913E8BB6C371B64"
##
## Variable: DocIDHash
## [1] "0x44749B4F7510099B0A4BEF85DE72E75ABD3CC90896949AAC4EF1A46598DCE490"
## [2] "0x78C451F6556F7129351AE28B3BA7DD499E258DDBEC31F89302AE62899301DB4A"
## [3] "0x613A9E9859B7CA68E8D3613BE4B2880059B2A5134E2B2B8C85186EAC073A3AC8"
## [4] "0x56D89BC906AA74D89F63D75436A2BBC0B2DE9EAD2D49CEA970713BD02290AE54"
## [5] "0xC510E8C80CF916E3A3F30A5F04DC3A8022F4F36693DF5BFED282F9575C55EC12"
## [6] "0x639BC1149F7DCEE41997906F58BF2032CE69AD41FA5404CBB9117571DEA7FF7B"
## [7] "0xBA6D5D11497ECF19C7A0295CB3BE0CEFE7649391F35FB1768C0409A5152DBB29"
## [8] "0x547AF4CEFB803EBB128C5E72B279F8E6F9ABD1D9F39804C125F1D4E75D112EBC"
## [9] "0xD53AAB77FC4500611E7210C0374ACE1EEEE0CDA884579764DE5FD8E3BF92767D"
## [10] "0xC15D6AF40193DAFB50FF3081F9172A886579F7A6F05E0A3041B5A1EA11E1074A"
## [11] "0xD68ADFC5BA8A13D376A57B9E0DEB2EB116235B749C6A60526EE8B9F20C2F80F5"
## [12] "0x416BDDF55E00D3DAFCF409BB2F6BFBB0EC2485746C1613D3C669E17A05C46E70"
## [13] "0xF7F8F90C807157F8DC908E3C755C8D41FB285DC8948968B982A99D81A1474276"
## [14] "0xA680C8A830675C0C29723634E1377786419648518D4673245679A9A152683A12"
## [15] "0x86F490B44CD068C0A09226E89AEA8A59BC9E5E3EAF20A6BBBA5EA7DB294C0A60"
## [16] "0xA67964202574DC2E30DF14A838D26B4982DBA18E7A3BF0E4A8E87F47CC701F3A"
## [17] "0x6EF04A842E6A189C2FB24BD3EC1733D848D190B2930E64E2BDE0FA96371EC7C7"
## [18] "0x44076C065C679BEB6426C095EA07613BDCB3EE2AD0F8355DBFBD515269C7BBAD"
## [19] "0xD5C981DA96B7B19EBED0A9F7F1DF7EBFE528797CB9B901CE3D46F7CC592A4F75"
## [20] "0x60F12E40D2C94F56652C0CA04BC56C0881734B3F56D3978A812778A6E4C6993E"
##
## Variable: AverageLeadTime
## [1] 41 119 94 47 148 0 33 230 213 157 1 16 71 167 6 91 7 24 84
## [20] 2
##
## Variable: LodgingRevenue
## [1] 53.00 1041.00 1512.00 219.00 268.80 0.00 218.00 870.00 1090.20
## [10] 168.00 368.00 281.47 270.24 448.80 225.00 467.95 536.00 126.00
## [19] 606.00 126.50
##
## Variable: OtherRevenue
## [1] 14.00 162.00 72.00 146.00 58.50 0.00 22.00 35.00 271.50 44.50
## [11] 61.75 94.50 226.00 77.50 69.00 142.00 36.00 118.00 167.00 70.00
##
## Variable: BookingsCanceled
## [1] 0 2 1 4 3 9
##
## Variable: BookingsNoShowed
## [1] 0 1 2 3
##
## Variable: BookingsCheckedIn
## [1] 1 0 2 3 8 4 7 12 10 5 34 6 13 9 11 29 14 57 19 15
##
## Variable: PersonsNights
## [1] 2 6 8 0 5 12 4 3 15 10 1 9 14 18 22 24 7 16 13 20
##
## Variable: RoomNights
## [1] 1 3 4 0 2 5 6 7 11 8 14 12 9 15 19 13 10 20 29 16
##
## Variable: DaysSinceLastStay
## [1] 734 181 568 433 789 -1 796 242 821 753 253 993 901 673 488 415 773 476 388
## [20] 187
##
## Variable: DaysSinceFirstStay
## [1] 734 181 568 433 789 -1 796 242 821 753 253 993 901 673 488 415 773 476 388
## [20] 190
##
## Variable: DistributionChannel
## [1] "Travel Agent/Operator" "Direct"
## [3] "Corporate" "Electronic Distribution"
##
## Variable: MarketSegment
## [1] "Travel Agent/Operator" "Other" "Direct"
## [4] "Groups" "Aviation" "Complementary"
## [7] "Corporate"
##
## Variable: SRHighFloor
## [1] 0 1
##
## Variable: SRLowFloor
## [1] 0 1
##
## Variable: SRAccessibleRoom
## [1] 0 1
##
## Variable: SRMediumFloor
## [1] 0 1
##
## Variable: SRBathtub
## [1] 0 1
##
## Variable: SRShower
## [1] 0 1
##
## Variable: SRCrib
## [1] 0 1
##
## Variable: SRKingSizeBed
## [1] 0 1
##
## Variable: SRTwinBed
## [1] 1 0
##
## Variable: SRNearElevator
## [1] 0 1
##
## Variable: SRAwayFromElevator
## [1] 0 1
##
## Variable: SRNoAlcoholInMiniBar
## [1] 0 1
##
## Variable: SRQuietRoom
## [1] 0 1
# Start the clean-up with the Age variable
# Coerce Age to numeric; text that is not a number turns into NA
HotelLisbon_data[["Age"]] <- as.numeric(as.character(HotelLisbon_data[["Age"]]))
## Warning: NAs introduced by coercion
# Median of the ages that were parsed successfully
median_age <- median(HotelLisbon_data[["Age"]], na.rm = TRUE)
# Fill every missing age with that median
HotelLisbon_data[["Age"]] <- replace(
  HotelLisbon_data[["Age"]],
  is.na(HotelLisbon_data[["Age"]]),
  median_age
)
# Report the median that was used
cat("Median Age:", median_age, "\n")
## Median Age: 46
# Confirm that no missing ages remain
sum(is.na(HotelLisbon_data[["Age"]]))
## [1] 0
# Age has 15 negative values; list those rows to see what they look like
# Logical flag: TRUE where Age is below zero
negative_age_rows <- HotelLisbon_data[["Age"]] < 0
# Keep the flagged rows with all of their columns
rows_with_negative_age <- HotelLisbon_data[negative_age_rows, ]
# Show them
print(rows_with_negative_age)
## # A tibble: 15 × 32
## ...1 ID Nationality Age DaysSinceCreation NameHash DocIDHash
## <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr>
## 1 2449 8361 DEU -7 927 0x72A31262A688A38B… 0x539E60…
## 2 8095 6752 GBR -11 953 0xDB672419DADD95C4… 0xB8599D…
## 3 16751 14688 PYF -1 832 0xF02B0B5CAD9B762F… 0x7F8199…
## 4 19578 8629 NLD -10 923 0xF5C3782374B119A5… 0xB0B8AE…
## 5 22127 2054 BIH -7 1034 0x816353557EFCF6A1… 0xE2B44F…
## 6 24083 57343 PRT -6 220 0xE327757DEF4F79D6… 0x6EB53E…
## 7 26007 5129 FRA -9 981 0x6471098D9873D3D9… 0x28EE0D…
## 8 31440 20406 USA -9 732 0x4464A2E5E3D712E9… 0xBC3566…
## 9 35378 15105 DEU -11 827 0x8338DF4BF7DFD49E… 0x8B1995…
## 10 36493 16975 FRA -1 799 0xC2DC5508F5A0CF5C… 0x995201…
## 11 44102 20329 DZA -1 733 0x599FC8CB50ED9179… 0x731067…
## 12 44118 14322 PRT -10 837 0x5202CE5913A8D676… 0x2BAA6D…
## 13 49911 10663 DEU -10 891 0x77AF5AA2214FAAC1… 0x4AFD62…
## 14 57657 9920 DZA -6 903 0x8106B7FC258F0757… 0xCACE52…
## 15 73661 10756 PRT -6 891 0x6EC142C9678B27CE… 0x3DD8D4…
## # ℹ 25 more variables: AverageLeadTime <dbl>, LodgingRevenue <dbl>,
## # OtherRevenue <dbl>, BookingsCanceled <dbl>, BookingsNoShowed <dbl>,
## # BookingsCheckedIn <dbl>, PersonsNights <dbl>, RoomNights <dbl>,
## # DaysSinceLastStay <dbl>, DaysSinceFirstStay <dbl>,
## # DistributionChannel <chr>, MarketSegment <chr>, SRHighFloor <dbl>,
## # SRLowFloor <dbl>, SRAccessibleRoom <dbl>, SRMediumFloor <dbl>,
## # SRBathtub <dbl>, SRShower <dbl>, SRCrib <dbl>, SRKingSizeBed <dbl>, …
#There seem to be inaccuracies in Age as well: there are negative values and single-digit ages, which cannot be right, since a hotel would not register children as customers.
# Report the class of every column
sapply(HotelLisbon_data, function(column) class(column))
## ...1 ID Nationality
## "numeric" "numeric" "character"
## Age DaysSinceCreation NameHash
## "numeric" "numeric" "character"
## DocIDHash AverageLeadTime LodgingRevenue
## "character" "numeric" "numeric"
## OtherRevenue BookingsCanceled BookingsNoShowed
## "numeric" "numeric" "numeric"
## BookingsCheckedIn PersonsNights RoomNights
## "numeric" "numeric" "numeric"
## DaysSinceLastStay DaysSinceFirstStay DistributionChannel
## "numeric" "numeric" "character"
## MarketSegment SRHighFloor SRLowFloor
## "character" "numeric" "numeric"
## SRAccessibleRoom SRMediumFloor SRBathtub
## "numeric" "numeric" "numeric"
## SRShower SRCrib SRKingSizeBed
## "numeric" "numeric" "numeric"
## SRTwinBed SRNearElevator SRAwayFromElevator
## "numeric" "numeric" "numeric"
## SRNoAlcoholInMiniBar SRQuietRoom
## "numeric" "numeric"
# Recode the categorical text columns and the 0/1 flag columns as factors
# Text columns -> factor
HotelLisbon_data[["Nationality"]] <-
  as.factor(HotelLisbon_data[["Nationality"]])
HotelLisbon_data[["DistributionChannel"]] <-
  as.factor(HotelLisbon_data[["DistributionChannel"]])
HotelLisbon_data[["MarketSegment"]] <-
  as.factor(HotelLisbon_data[["MarketSegment"]])
# Names of the 0/1 special-request (SR*) columns
binary_vars <- c(
  "SRHighFloor", "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor",
  "SRBathtub", "SRShower", "SRCrib", "SRKingSizeBed", "SRTwinBed",
  "SRNearElevator", "SRAwayFromElevator", "SRNoAlcoholInMiniBar",
  "SRQuietRoom"
)
# Flag columns -> factor, all in one assignment
HotelLisbon_data[binary_vars] <- Map(as.factor, HotelLisbon_data[binary_vars])
# Make sure the three booking counters are stored as numeric
HotelLisbon_data[["BookingsCheckedIn"]] <-
  as.numeric(HotelLisbon_data[["BookingsCheckedIn"]])
HotelLisbon_data[["BookingsNoShowed"]] <-
  as.numeric(HotelLisbon_data[["BookingsNoShowed"]])
HotelLisbon_data[["BookingsCanceled"]] <-
  as.numeric(HotelLisbon_data[["BookingsCanceled"]])
# Re-list the column classes to confirm the conversions
sapply(HotelLisbon_data, class)
## ...1 ID Nationality
## "numeric" "numeric" "factor"
## Age DaysSinceCreation NameHash
## "numeric" "numeric" "character"
## DocIDHash AverageLeadTime LodgingRevenue
## "character" "numeric" "numeric"
## OtherRevenue BookingsCanceled BookingsNoShowed
## "numeric" "numeric" "numeric"
## BookingsCheckedIn PersonsNights RoomNights
## "numeric" "numeric" "numeric"
## DaysSinceLastStay DaysSinceFirstStay DistributionChannel
## "numeric" "numeric" "factor"
## MarketSegment SRHighFloor SRLowFloor
## "factor" "factor" "factor"
## SRAccessibleRoom SRMediumFloor SRBathtub
## "factor" "factor" "factor"
## SRShower SRCrib SRKingSizeBed
## "factor" "factor" "factor"
## SRTwinBed SRNearElevator SRAwayFromElevator
## "factor" "factor" "factor"
## SRNoAlcoholInMiniBar SRQuietRoom
## "factor" "factor"
# Show the structure of the data frame (type and first values of each column)
str(HotelLisbon_data)
## spc_tbl_ [75,000 × 32] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:75000] 1 2 3 4 5 6 7 8 9 10 ...
## $ ID : num [1:75000] 20351 62663 30398 39784 17929 ...
## $ Nationality : Factor w/ 185 levels "ABW","AGO","AIA",..: 25 29 132 56 73 56 82 52 16 82 ...
## $ Age : num [1:75000] 85 30 70 31 29 14 49 55 53 47 ...
## $ DaysSinceCreation : num [1:75000] 733 178 564 430 785 314 794 237 817 750 ...
## $ NameHash : chr [1:75000] "0x0BF2ECC3BF14F7FF3F926275E9BAAFAFF5823E69F81319DCCC5867DC986E10DC" "0xE4899D5F1CF2354CE1EBCD1717CE2EC2D91DE694C9118ADA37CB726A3F43DE22" "0xA1D0401C1635B389B99596B21D6C463B6E63444457B010DDD3D31AC1CE19C2ED" "0x0C948619213E11A1EB2E326CA64277BAFADDDDE51D7D2EB9DA7B71295C8704BC" ...
## $ DocIDHash : chr [1:75000] "0x44749B4F7510099B0A4BEF85DE72E75ABD3CC90896949AAC4EF1A46598DCE490" "0x78C451F6556F7129351AE28B3BA7DD499E258DDBEC31F89302AE62899301DB4A" "0x613A9E9859B7CA68E8D3613BE4B2880059B2A5134E2B2B8C85186EAC073A3AC8" "0x56D89BC906AA74D89F63D75436A2BBC0B2DE9EAD2D49CEA970713BD02290AE54" ...
## $ AverageLeadTime : num [1:75000] 41 119 94 47 148 0 33 230 213 157 ...
## $ LodgingRevenue : num [1:75000] 53 1041 1512 219 269 ...
## $ OtherRevenue : num [1:75000] 14 162 72 146 58.5 ...
## $ BookingsCanceled : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ BookingsNoShowed : num [1:75000] 0 0 0 0 0 0 0 0 0 0 ...
## $ BookingsCheckedIn : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
## $ PersonsNights : num [1:75000] 2 6 8 6 8 0 2 5 8 6 ...
## $ RoomNights : num [1:75000] 1 3 4 3 4 0 2 5 4 3 ...
## $ DaysSinceLastStay : num [1:75000] 734 181 568 433 789 -1 796 242 821 753 ...
## $ DaysSinceFirstStay : num [1:75000] 734 181 568 433 789 -1 796 242 821 753 ...
## $ DistributionChannel : Factor w/ 4 levels "Corporate","Direct",..: 4 4 4 4 2 4 2 4 4 4 ...
## $ MarketSegment : Factor w/ 7 levels "Aviation","Complementary",..: 7 6 6 7 4 6 4 5 6 7 ...
## $ SRHighFloor : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ SRLowFloor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRAccessibleRoom : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRMediumFloor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRBathtub : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRShower : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRCrib : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
## $ SRKingSizeBed : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 1 ...
## $ SRTwinBed : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
## $ SRNearElevator : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRAwayFromElevator : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRNoAlcoholInMiniBar: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRQuietRoom : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. ID = col_double(),
## .. Nationality = col_character(),
## .. Age = col_character(),
## .. DaysSinceCreation = col_double(),
## .. NameHash = col_character(),
## .. DocIDHash = col_character(),
## .. AverageLeadTime = col_double(),
## .. LodgingRevenue = col_double(),
## .. OtherRevenue = col_double(),
## .. BookingsCanceled = col_double(),
## .. BookingsNoShowed = col_double(),
## .. BookingsCheckedIn = col_double(),
## .. PersonsNights = col_double(),
## .. RoomNights = col_double(),
## .. DaysSinceLastStay = col_double(),
## .. DaysSinceFirstStay = col_double(),
## .. DistributionChannel = col_character(),
## .. MarketSegment = col_character(),
## .. SRHighFloor = col_double(),
## .. SRLowFloor = col_double(),
## .. SRAccessibleRoom = col_double(),
## .. SRMediumFloor = col_double(),
## .. SRBathtub = col_double(),
## .. SRShower = col_double(),
## .. SRCrib = col_double(),
## .. SRKingSizeBed = col_double(),
## .. SRTwinBed = col_double(),
## .. SRNearElevator = col_double(),
## .. SRAwayFromElevator = col_double(),
## .. SRNoAlcoholInMiniBar = col_double(),
## .. SRQuietRoom = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary: quantiles for numeric columns, level counts for factors
summary(HotelLisbon_data)
## ...1 ID Nationality Age
## Min. : 1 Min. : 1 FRA :11170 Min. :-11.0
## 1st Qu.:18751 1st Qu.:20885 PRT :10365 1st Qu.: 34.0
## Median :37501 Median :41825 DEU : 9242 Median : 46.0
## Mean :37501 Mean :41823 GBR : 7750 Mean : 45.4
## 3rd Qu.:56250 3rd Qu.:62715 ESP : 4399 3rd Qu.: 56.0
## Max. :75000 Max. :83590 USA : 3076 Max. :114.0
## (Other):28998
## DaysSinceCreation NameHash DocIDHash AverageLeadTime
## Min. : 0.0 Length:75000 Length:75000 Min. : -1.00
## 1st Qu.: 177.0 Class :character Class :character 1st Qu.: 0.00
## Median : 396.0 Mode :character Mode :character Median : 29.00
## Mean : 453.4 Mean : 66.21
## 3rd Qu.: 723.0 3rd Qu.:103.00
## Max. :1095.0 Max. :588.00
##
## LodgingRevenue OtherRevenue BookingsCanceled BookingsNoShowed
## Min. : 0.0 Min. : 0.00 Min. :0.000000 Min. :0.0000000
## 1st Qu.: 59.0 1st Qu.: 2.00 1st Qu.:0.000000 1st Qu.:0.0000000
## Median : 234.0 Median : 38.50 Median :0.000000 Median :0.0000000
## Mean : 299.0 Mean : 67.49 Mean :0.001987 Mean :0.0005867
## 3rd Qu.: 403.2 3rd Qu.: 88.00 3rd Qu.:0.000000 3rd Qu.:0.0000000
## Max. :21781.0 Max. :5105.50 Max. :9.000000 Max. :3.0000000
##
## BookingsCheckedIn PersonsNights RoomNights DaysSinceLastStay
## Min. : 0.0000 Min. : 0.000 Min. : 0.000 Min. : -1.0
## 1st Qu.: 1.0000 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 26.0
## Median : 1.0000 Median : 4.000 Median : 2.000 Median : 366.0
## Mean : 0.7934 Mean : 4.647 Mean : 2.358 Mean : 400.9
## 3rd Qu.: 1.0000 3rd Qu.: 6.000 3rd Qu.: 4.000 3rd Qu.: 694.0
## Max. :57.0000 Max. :116.000 Max. :185.000 Max. :1104.0
##
## DaysSinceFirstStay DistributionChannel
## Min. : -1.0 Corporate : 2340
## 1st Qu.: 27.0 Direct :10714
## Median : 369.0 Electronic Distribution: 456
## Mean : 403.1 Travel Agent/Operator :61490
## 3rd Qu.: 698.0
## Max. :1186.0
##
## MarketSegment SRHighFloor SRLowFloor SRAccessibleRoom
## Aviation : 221 0:71435 0:74897 0:74979
## Complementary : 453 1: 3565 1: 103 1: 21
## Corporate : 1939
## Direct :10312
## Groups : 8519
## Other :43046
## Travel Agent/Operator:10510
## SRMediumFloor SRBathtub SRShower SRCrib SRKingSizeBed SRTwinBed
## 0:74933 0:74784 0:74866 0:74006 0:48583 0:64277
## 1: 67 1: 216 1: 134 1: 994 1:26417 1:10723
##
##
##
##
##
## SRNearElevator SRAwayFromElevator SRNoAlcoholInMiniBar SRQuietRoom
## 0:74973 0:74729 0:74990 0:68398
## 1: 27 1: 271 1: 10 1: 6602
##
##
##
##
##
# Plot the distribution of every analysed column: a 30-bin histogram for
# numeric columns and a bar chart of counts for all other columns.
histogram_cols <- c(
  "Age", "DaysSinceCreation", "AverageLeadTime", "LodgingRevenue",
  "OtherRevenue", "BookingsCanceled", "BookingsNoShowed", "BookingsCheckedIn",
  "PersonsNights", "RoomNights", "DaysSinceLastStay", "DaysSinceFirstStay",
  "Nationality", "DistributionChannel", "MarketSegment", "SRHighFloor",
  "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor", "SRBathtub", "SRShower",
  "SRCrib", "SRKingSizeBed", "SRTwinBed", "SRNearElevator",
  "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom"
)
for (col in histogram_cols) {
  # aes_string() is deprecated since ggplot2 3.0.0; the .data pronoun is the
  # supported way to map a column whose name is held in a string. The axis
  # label is set explicitly through labs(), so the plots look the same.
  if (is.numeric(HotelLisbon_data[[col]])) {
    plot <- ggplot(HotelLisbon_data, aes(x = .data[[col]])) +
      geom_histogram(bins = 30, fill = "blue", color = "black") +
      labs(title = paste("Histogram of", col), x = col, y = "Frequency") +
      theme_minimal()
  } else {
    plot <- ggplot(HotelLisbon_data, aes(x = .data[[col]])) +
      geom_bar(fill = "blue", color = "black") +
      labs(title = paste("Bar Chart of", col), x = col, y = "Count") +
      theme_minimal()
  }
  print(plot)
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Further visualisations, handled separately for numerical and categorical
# variables.
# Numeric Variables
numeric_cols <- c(
  "Age", "AverageLeadTime", "LodgingRevenue", "OtherRevenue",
  "BookingsCanceled", "BookingsNoShowed", "BookingsCheckedIn",
  "PersonsNights", "RoomNights", "DaysSinceLastStay", "DaysSinceFirstStay"
)
# One boxplot per numeric variable, with outliers shown as large red points.
# The outliers are styled through geom_boxplot() itself. A boxplot without an
# x aesthetic is drawn at x = 0, so the previous overlay of points at x = 1
# placed the red markers beside the box instead of on its axis.
for (col in numeric_cols) {
  plot <- ggplot(HotelLisbon_data, aes(y = as.numeric(.data[[col]]))) +
    geom_boxplot(
      fill = "blue",
      color = "black",
      outlier.colour = "red",
      outlier.size = 3
    ) +
    labs(title = paste("Boxplot of", col), y = col) +
    theme_minimal()
  print(plot)
}
# Categorical Variables
categorical_cols <- c(
  "Nationality", "DistributionChannel", "MarketSegment", "SRHighFloor",
  "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor", "SRBathtub", "SRShower",
  "SRCrib", "SRKingSizeBed", "SRTwinBed", "SRNearElevator",
  "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom"
)
# One bar chart of counts per categorical variable; x labels are rotated 45
# degrees so long level names stay readable.
for (col in categorical_cols) {
  # .data[[col]] replaces the deprecated aes_string() for a column name held
  # in a string; labs() keeps the axis title equal to the column name.
  plot <- ggplot(HotelLisbon_data, aes(x = .data[[col]])) +
    geom_bar(fill = "blue", color = "black") +
    labs(title = paste("Bar Chart of", col), x = col, y = "Count") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(plot)
}
Our business question is: #At Hotel Lisbon, how many customers are cancelling bookings (“BookingsCanceled”) and not showing up (“BookingsNoShowed”), according to the customer data recorded by Hotel Lisbon? And what must the hoteliers do to decrease the number of cancellations and no-shows?
# Box plots of every numeric column, to investigate outliers
library(ggplot2)
library(dplyr)
all_columns <- names(HotelLisbon_data)
for (col in all_columns) {
  # Non-numeric columns are skipped: a box plot needs a numeric y variable
  if (is.numeric(HotelLisbon_data[[col]])) {
    # .data[[col]] replaces the deprecated aes_string() for a column name
    # held in a string; labs() keeps the y-axis title equal to the name.
    plot <- ggplot(HotelLisbon_data, aes(y = .data[[col]])) +
      geom_boxplot(fill = "blue", color = "black") +
      labs(title = paste("Box Plot of", col), y = col, x = "") +
      theme_minimal()
    print(plot)
  }
}
#By investigating the data set, we have found that Age has some “NULL” values and some negative values, which should be cleaned or investigated further.
#We need to create a better categorical table for customer ID, so that we know whether there are any repeat customers.
# Are BookingsCanceled, BookingsNoShowed and BookingsCheckedIn binary?
# Distinct values of BookingsCanceled
unique_values_canceled <- unique(HotelLisbon_data[["BookingsCanceled"]])
# Distinct values of BookingsNoShowed
unique_values_no_show <- unique(HotelLisbon_data[["BookingsNoShowed"]])
# Distinct values of BookingsCheckedIn
unique_values_checkedIn <- unique(HotelLisbon_data[["BookingsCheckedIn"]])
# Show the distinct values
cat("Unique values for BookingsCanceled:", unique_values_canceled, "\n")
## Unique values for BookingsCanceled: 0 2 1 4 3 9
cat("Unique values for BookingsNoShowed:", unique_values_no_show, "\n")
## Unique values for BookingsNoShowed: 0 1 2 3
cat("Unique values for BookingsCheckedIn:", unique_values_checkedIn, "\n")
## Unique values for BookingsCheckedIn: 1 0 2 3 8 4 7 12 10 5 34 6 13 9 11 29 14 57 19 15 20 40 23 26 25 17 18
# A variable counts as binary here when it has exactly two distinct values
is_binary_canceled <- length(unique_values_canceled) == 2L
is_binary_no_show <- length(unique_values_no_show) == 2L
is_binary_checkedIn <- length(unique_values_checkedIn) == 2L
# Show the verdicts
cat("BookingsCanceled is binary:", is_binary_canceled, "\n")
## BookingsCanceled is binary: FALSE
cat("BookingsNoShowed is binary:", is_binary_no_show, "\n")
## BookingsNoShowed is binary: FALSE
cat("BookingsCheckedIN is binary:", is_binary_checkedIn, "\n")
## BookingsCheckedIN is binary: FALSE
# Five-number summaries of the three booking counters
summary(HotelLisbon_data[["BookingsCanceled"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.001987 0.000000 9.000000
summary(HotelLisbon_data[["BookingsNoShowed"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000000 0.0000000 0.0000000 0.0005867 0.0000000 3.0000000
summary(HotelLisbon_data[["BookingsCheckedIn"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.7934 1.0000 57.0000
# Storage class of each booking counter
class_canceled <- class(HotelLisbon_data[["BookingsCanceled"]])
class_checked_in <- class(HotelLisbon_data[["BookingsCheckedIn"]])
class_no_show <- class(HotelLisbon_data[["BookingsNoShowed"]])
# Show the classes
cat("Class of BookingsCanceled:", class_canceled, "\n")
## Class of BookingsCanceled: numeric
cat("Class of BookingsCheckedIn:", class_checked_in, "\n")
## Class of BookingsCheckedIn: numeric
cat("Class of BookingsNoShowed:", class_no_show, "\n")
## Class of BookingsNoShowed: numeric
# Summaries of DaysSinceFirstStay and DaysSinceLastStay (repeat-customer
# behaviour), followed by AverageLeadTime and Age
summary(HotelLisbon_data[["DaysSinceFirstStay"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.0 27.0 369.0 403.1 698.0 1186.0
summary(HotelLisbon_data[["DaysSinceLastStay"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.0 26.0 366.0 400.9 694.0 1104.0
summary(HotelLisbon_data[["AverageLeadTime"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.00 0.00 29.00 66.21 103.00 588.00
summary(HotelLisbon_data[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -11.0 34.0 46.0 45.4 56.0 114.0
#Looking at the data table, some observations of DaysSinceLastStay and DaysSinceFirstStay hold the value -1, and AverageLeadTime is 0 for these observations. #How do we resolve this? ————————————————————————————————————————————– #By observing the data set, I can see -1 written in DaysSinceLastStay and DaysSinceFirstStay. This is wrong, as the data dictionary says:
#DaysSinceLastStay
#The number of days elapsed between the last day of the extraction and the customer’s last arrival date (of a checked-in booking). A value of 1 indicates the customer never stayed at the hotel
#DaysSinceFirstStay
#The number of days elapsed between the last day of the extraction and the customer’s first arrival date (of a checked-in booking). A value of 1 indicates the customer never stayed at the hotel
This means the value needs to be recorded as 1, with the negative sign removed.
#AverageLeadTime #The average number of days elapsed between the customer’s booking date and arrival date. In other words, this variable is calculated by dividing the sum of the number of days elapsed between the moment each booking was made and its arrival date, by the total of bookings made by the customer
#Looking at BookingsCheckedIn, BookingsCanceled and BookingsNoShowed for customers with -1 in DaysSinceFirstStay and DaysSinceLastStay, they are all 0, which is not right. If the customer did not show up, BookingsCanceled or BookingsNoShowed should say 1, but they are 0. This needs to be corrected.
We need to make a new binary variable from the no-shows and cancellations, and evaluate whether they are indeed binary.
We also have to shorten our collection of variables.
#Do we need LodgingRevenue and OtherRevenue, and should we transform them into logs?
# Clean Age and the "never stayed" records.
# - Ages below 16 (which includes every negative age) are treated as invalid
#   and replaced by the median of the remaining ages.
# - A DaysSinceLastStay / DaysSinceFirstStay of -1 is recoded to 1, and those
#   customers get BookingsCheckedIn = 0, BookingsNoShowed = 1 and
#   BookingsCanceled = 1.
# Convert Age to numeric
HotelLisbon_data$Age <- as.numeric(as.character(HotelLisbon_data$Age))
# Ages under 16 become NA. The former extra `Age < 0` test was redundant,
# because every negative age is also below 16.
HotelLisbon_data$Age[which(HotelLisbon_data$Age < 16)] <- NA
# Replace missing values in Age with median
median_age <- median(HotelLisbon_data$Age, na.rm = TRUE)
HotelLisbon_data$Age[is.na(HotelLisbon_data$Age)] <- median_age
# Print the median age
cat("Median Age:", median_age, "\n")
## Median Age: 46
# Identify the never-stayed customers from the -1 sentinel BEFORE recoding it.
# Testing for == 1 after the recode also matched customers whose genuine
# value is 1 and overwrote their real booking counters.
never_stayed <- which(
  HotelLisbon_data$DaysSinceLastStay == -1 |
    HotelLisbon_data$DaysSinceFirstStay == -1
)
# Replace -1 in DaysSinceLastStay and DaysSinceFirstStay with 1
# NOTE(review): 1 can also be a genuine day count, so after this recode the
# sentinel can no longer be told apart from real values - confirm that 1 is
# the intended code.
HotelLisbon_data$DaysSinceLastStay[never_stayed] <-
  ifelse(HotelLisbon_data$DaysSinceLastStay[never_stayed] == -1, 1,
         HotelLisbon_data$DaysSinceLastStay[never_stayed])
HotelLisbon_data$DaysSinceFirstStay[never_stayed] <-
  ifelse(HotelLisbon_data$DaysSinceFirstStay[never_stayed] == -1, 1,
         HotelLisbon_data$DaysSinceFirstStay[never_stayed])
# Update BookingsCheckedIn, BookingsCanceled, and BookingsNoShowed for the
# never-stayed customers only
HotelLisbon_data$BookingsCheckedIn[never_stayed] <- 0
HotelLisbon_data$BookingsNoShowed[never_stayed] <- 1
HotelLisbon_data$BookingsCanceled[never_stayed] <- 1
# Check for missing values after replacement
sum(is.na(HotelLisbon_data$Age))
## [1] 0
# Re-check the cleaned columns
summary(HotelLisbon_data[["DaysSinceLastStay"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 26.0 366.0 401.4 694.0 1104.0
summary(HotelLisbon_data[["DaysSinceFirstStay"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 27.0 369.0 403.6 698.0 1186.0
summary(HotelLisbon_data[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 37.00 46.00 47.04 56.00 114.00
summary(HotelLisbon_data[["BookingsCanceled"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2405 0.0000 9.0000
summary(HotelLisbon_data[["BookingsNoShowed"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2391 0.0000 3.0000
summary(HotelLisbon_data[["BookingsCheckedIn"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 1.0000 1.0000 0.7933 1.0000 57.0000
#The negative values in these variables have been resolved. Ages below 16 have also been replaced, so every age is now 16 or above. #BookingsCanceled and BookingsNoShowed now have a value of 1 where they were previously 0 because of the -1 written in DaysSinceFirstStay and DaysSinceLastStay.
#Let’s check again the unique values of Bookings checked In, Bookings canceled and Bookings No showed.
# Distinct values of BookingsCanceled after cleaning
New_unique_values_canceled <- unique(HotelLisbon_data[["BookingsCanceled"]])
# Distinct values of BookingsNoShowed after cleaning
New_unique_values_no_show <- unique(HotelLisbon_data[["BookingsNoShowed"]])
# Distinct values of BookingsCheckedIn after cleaning
New_unique_values_checkedIn <- unique(HotelLisbon_data[["BookingsCheckedIn"]])
# Show the distinct values
cat("New Unique values for BookingsCanceled:", New_unique_values_canceled, "\n")
## New Unique values for BookingsCanceled: 0 1 4 3 2 9
cat("New Unique values for BookingsNoShowed:", New_unique_values_no_show, "\n")
## New Unique values for BookingsNoShowed: 0 1 2 3
cat("New Unique values for BookingsCheckedIn:", New_unique_values_checkedIn, "\n")
## New Unique values for BookingsCheckedIn: 1 0 2 3 8 4 7 12 10 5 34 6 13 9 11 29 14 57 19 15 20 40 23 26 25 17 18
# Count how often each distinct value occurs in the three booking counters
# Frequency table of BookingsCanceled
frequency_canceled <- table(HotelLisbon_data[["BookingsCanceled"]])
# Frequency table of BookingsNoShowed
frequency_no_show <- table(HotelLisbon_data[["BookingsNoShowed"]])
# Frequency table of BookingsCheckedIn
frequency_checked_in <- table(HotelLisbon_data[["BookingsCheckedIn"]])
# Show the counts (cat() prints the counts only, without the value labels)
cat("Frequency of each numeric value in BookingsCanceled:\n", frequency_canceled, "\n\n")
## Frequency of each numeric value in BookingsCanceled:
## 57004 17976 9 7 3 1
cat("Frequency of each numeric value in BookingsNoShowed:\n", frequency_no_show, "\n\n")
## Frequency of each numeric value in BookingsNoShowed:
## 57074 17920 5 1
cat("Frequency of each numeric value in BookingsCheckedIn:\n", frequency_checked_in, "\n")
## Frequency of each numeric value in BookingsCheckedIn:
## 17889 55813 1017 121 57 18 17 16 9 7 6 6 4 4 2 2 1 1 1 1 1 1 1 2 1 1 1
# Print a heading followed by one "value : count" line per row of a
# frequency table.
#
# label:     variable name shown in the heading line.
# frequency: data frame with columns Var1 (value) and Freq (count), as
#            produced by as.data.frame(table(x)).
# Returns `frequency` invisibly.
#
# seq_len() keeps the loop from running when the table has no rows;
# 1:nrow() would iterate over c(1, 0) in that case.
print_value_frequencies <- function(label, frequency) {
  cat("Frequency of each numeric value in ", label, ":\n", sep = "")
  for (i in seq_len(nrow(frequency))) {
    cat(paste(frequency$Var1[i], ":", frequency$Freq[i]), "\n")
  }
  invisible(frequency)
}
# Count the frequency of each numeric value in BookingsCanceled
frequency_canceled <- as.data.frame(table(HotelLisbon_data$BookingsCanceled))
print_value_frequencies("BookingsCanceled", frequency_canceled)
## Frequency of each numeric value in BookingsCanceled:
## 0 : 57004
## 1 : 17976
## 2 : 9
## 3 : 7
## 4 : 3
## 9 : 1
cat("\n")
# Count the frequency of each numeric value in BookingsNoShowed
frequency_no_show <- as.data.frame(table(HotelLisbon_data$BookingsNoShowed))
print_value_frequencies("BookingsNoShowed", frequency_no_show)
## Frequency of each numeric value in BookingsNoShowed:
## 0 : 57074
## 1 : 17920
## 2 : 5
## 3 : 1
cat("\n")
# Count the frequency of each numeric value in BookingsCheckedIn
frequency_checked_in <- as.data.frame(table(HotelLisbon_data$BookingsCheckedIn))
print_value_frequencies("BookingsCheckedIn", frequency_checked_in)
## Frequency of each numeric value in BookingsCheckedIn:
## 0 : 17889
## 1 : 55813
## 2 : 1017
## 3 : 121
## 4 : 57
## 5 : 18
## 6 : 17
## 7 : 16
## 8 : 9
## 9 : 7
## 10 : 6
## 11 : 6
## 12 : 4
## 13 : 4
## 14 : 2
## 15 : 2
## 17 : 1
## 18 : 1
## 19 : 1
## 20 : 1
## 23 : 1
## 25 : 1
## 26 : 1
## 29 : 2
## 34 : 1
## 40 : 1
## 57 : 1
#BookingsCanceled:
The majority of customers (57,004 out of 75,000) did not cancel their bookings (0). A significant number of customers (17,976) canceled their bookings once (1). There are a few cases where customers canceled multiple bookings (2, 3, 4, 9 times).
#BookingsNoShowed:
The majority of customers (57,074 out of 75,000) did not have a “no-show” (0). A significant number of customers (17,920) had one “no-show” incident (1). There are a few cases where customers had multiple “no-show” incidents (2, 3).
#BookingsCheckedIn:
A large number of customers (55,813 out of 75,000) checked in once (1). There are customers who checked in multiple times, with various frequencies. Based on the data dictionary and the understanding of the variables: BookingsCheckedIn: 0: 17,889 customers did not check in for any bookings. 1: 55,813 customers checked in once. 2: 1,017 customers checked in twice. 3: 121 customers checked in thrice. 4: 57 customers checked in four times. 5-15: There is a decreasing trend in the number of customers who checked in for more bookings, with decreasing frequency. 17-40: The frequency further decreases for customers who checked in for a higher number of bookings. 57: There is one customer who checked in 57 times. Analysis:
The majority of customers (55,813) checked in for just one booking, indicating that a significant portion of customers had a single stay event. The number of customers decreases as the number of check-ins increases, suggesting that fewer customers have multiple stay events. There are outliers or rare cases where a small number of customers have a very high number of check-ins, which might be unusual or require further investigation. This variable provides insights into customer behavior regarding the frequency of stays, with the majority having a limited number of stay events.
Let’s create new categories for BookingsCheckedIn, BookingsCanceled and BookingsNoShowed, so that we have fewer categories and a more concise description of these customers.
# Categorize a single BookingsCheckedIn count into a descriptive label.
#
# Arguments:
#   count: one numeric check-in count (the function is applied one value
#          at a time; it is not vectorized).
# Returns:
#   A length-one character label. Missing, negative, or fractional values
#   that match no bin return "Undefined Category".
categorize_checked_in <- function(count) {
  # An NA count would make the comparisons below fail inside `if`.
  if (is.na(count)) {
    return("Undefined Category")
  }
  if (count == 0) {
    return("Booked but Not Checked In")
  }
  if (count == 1) {
    return("Stayed Once")
  }
  if (count == 2) {
    return("Stayed Twice")
  }
  if (count == 3) {
    return("Stayed Thrice")
  }
  if (count >= 4 && count <= 15) {
    return("Good Loyal Customers")
  }
  # The top bin is contiguous and open-ended: the previous 17..57 range left
  # a count of 16, and anything above the observed maximum, uncategorized.
  if (count >= 16) {
    return("Great Loyal Customers")
  }
  "Undefined Category"
}
# creating a new variable for BookingsCheckedIn
# vapply() guarantees a character vector with one label per row; sapply()
# would return an empty list for zero-length input.
HotelLisbon_data$CheckedInCategory <- vapply(
  HotelLisbon_data$BookingsCheckedIn,
  categorize_checked_in,
  FUN.VALUE = character(1),
  USE.NAMES = FALSE
)
# Categorize a single BookingsCanceled count into a descriptive label.
#
# Arguments:
#   count: one numeric cancellation count (applied one value at a time).
# Returns:
#   A length-one character label. Missing, negative, or fractional values
#   below 1 return "Undefined Category".
#
# NOTE(review): the zero-cancellation label reads "Checked In and Stayed",
# although only the cancellation count is inspected here. The label text is
# kept unchanged because later tables are keyed on it.
categorize_canceled <- function(count) {
  # An NA count would make the comparisons below fail inside `if`.
  if (is.na(count)) {
    return("Undefined Category")
  }
  if (count == 0) {
    return("Checked In and Stayed")
  }
  # Open-ended: the previous upper bound of 9 was only the observed maximum.
  if (count >= 1) {
    return("Canceled Bookings")
  }
  "Undefined Category"
}
# creating a new variable for BookingsCanceled
# vapply() guarantees a character vector with one label per row; sapply()
# would return an empty list for zero-length input.
HotelLisbon_data$CanceledCategory <- vapply(
  HotelLisbon_data$BookingsCanceled,
  categorize_canceled,
  FUN.VALUE = character(1),
  USE.NAMES = FALSE
)
# Categorize a single BookingsNoShowed count into a descriptive label.
#
# Arguments:
#   count: one numeric no-show count (applied one value at a time).
# Returns:
#   A length-one character label. Missing, negative, or fractional values
#   below 1 return "Undefined Category".
categorize_no_show <- function(count) {
  # An NA count would make the comparisons below fail inside `if`.
  if (is.na(count)) {
    return("Undefined Category")
  }
  if (count == 0) {
    return("Customers Showed up and Stayed")
  }
  # Open-ended: the previous upper bound of 3 was only the observed maximum.
  if (count >= 1) {
    return("No Show Customers")
  }
  "Undefined Category"
}
# creating a new variable for BookingsNoShowed
# vapply() guarantees a character vector with one label per row; sapply()
# would return an empty list for zero-length input.
HotelLisbon_data$NoShowCategory <- vapply(
  HotelLisbon_data$BookingsNoShowed,
  categorize_no_show,
  FUN.VALUE = character(1),
  USE.NAMES = FALSE
)
# Viewing the first rows of the three derived category columns
head(HotelLisbon_data[, c("CheckedInCategory", "CanceledCategory", "NoShowCategory")])
## # A tibble: 6 × 3
## CheckedInCategory CanceledCategory NoShowCategory
## <chr> <chr> <chr>
## 1 Stayed Once Checked In and Stayed Customers Showed up and Stayed
## 2 Stayed Once Checked In and Stayed Customers Showed up and Stayed
## 3 Stayed Once Checked In and Stayed Customers Showed up and Stayed
## 4 Stayed Once Checked In and Stayed Customers Showed up and Stayed
## 5 Stayed Once Checked In and Stayed Customers Showed up and Stayed
## 6 Booked but Not Checked In Canceled Bookings No Show Customers
# Count how many rows fall into each derived category label.
# as.data.frame(table(x)) returns one row per label with columns
# Var1 (the label) and Freq (the number of rows).
# Frequency of each customer in CheckedInCategory
frequency_checked_in_category <- as.data.frame(table(HotelLisbon_data$CheckedInCategory))
# Frequency of each customer in CanceledCategory
frequency_canceled_category <- as.data.frame(table(HotelLisbon_data$CanceledCategory))
# Frequency of each customer in NoShowCategory
frequency_no_show_category <- as.data.frame(table(HotelLisbon_data$NoShowCategory))
# View the results
print("Frequency of each customer in CheckedInCategory:")
## [1] "Frequency of each customer in CheckedInCategory:"
print(frequency_checked_in_category)
## Var1 Freq
## 1 Booked but Not Checked In 17889
## 2 Good Loyal Customers 148
## 3 Great Loyal Customers 12
## 4 Stayed Once 55813
## 5 Stayed Thrice 121
## 6 Stayed Twice 1017
cat("\n")
print("Frequency of each customer in CanceledCategory:")
## [1] "Frequency of each customer in CanceledCategory:"
print(frequency_canceled_category)
## Var1 Freq
## 1 Canceled Bookings 17996
## 2 Checked In and Stayed 57004
cat("\n")
print("Frequency of each customer in NoShowCategory:")
## [1] "Frequency of each customer in NoShowCategory:"
print(frequency_no_show_category)
## Var1 Freq
## 1 Customers Showed up and Stayed 57074
## 2 No Show Customers 17926
#we have created three new variables that are characteristic in nature for making the readings of Booking frequency more presentable.
Now we have clearly categorized the customers’ check-in, cancellation and no-show data.
#Let’s make a new binary variable called CustomerOutcome from the variables BookingsCanceled and BookingsNoShowed (BookingsCheckedIn is not used in the rule below)
# Derive the binary CustomerOutcome flag:
#   0 = the customer has at least one cancellation or at least one no-show
#   1 = the customer has neither
# Only BookingsCanceled and BookingsNoShowed are consulted here.
had_cancel_or_no_show <- with(
  HotelLisbon_data,
  BookingsCanceled > 0 | BookingsNoShowed > 0
)
HotelLisbon_data$CustomerOutcome <- as.numeric(!had_cancel_or_no_show)
# List the distinct values that CustomerOutcome takes
unique(HotelLisbon_data$CustomerOutcome)
## [1] 1 0
table(HotelLisbon_data$CustomerOutcome)
##
## 0 1
## 18020 56980
We have successfully created the binary variable “CustomerOutcome”. #It shows that 18020 customers did not show up or canceled their reservations # 56980 customers booked and stayed at the hotel (no cancellations or no-shows)
#Data dimension:
ncol(HotelLisbon_data)
## [1] 36
colnames(HotelLisbon_data)
## [1] "...1" "ID" "Nationality"
## [4] "Age" "DaysSinceCreation" "NameHash"
## [7] "DocIDHash" "AverageLeadTime" "LodgingRevenue"
## [10] "OtherRevenue" "BookingsCanceled" "BookingsNoShowed"
## [13] "BookingsCheckedIn" "PersonsNights" "RoomNights"
## [16] "DaysSinceLastStay" "DaysSinceFirstStay" "DistributionChannel"
## [19] "MarketSegment" "SRHighFloor" "SRLowFloor"
## [22] "SRAccessibleRoom" "SRMediumFloor" "SRBathtub"
## [25] "SRShower" "SRCrib" "SRKingSizeBed"
## [28] "SRTwinBed" "SRNearElevator" "SRAwayFromElevator"
## [31] "SRNoAlcoholInMiniBar" "SRQuietRoom" "CheckedInCategory"
## [34] "CanceledCategory" "NoShowCategory" "CustomerOutcome"
sapply(HotelLisbon_data, class)
## ...1 ID Nationality
## "numeric" "numeric" "factor"
## Age DaysSinceCreation NameHash
## "numeric" "numeric" "character"
## DocIDHash AverageLeadTime LodgingRevenue
## "character" "numeric" "numeric"
## OtherRevenue BookingsCanceled BookingsNoShowed
## "numeric" "numeric" "numeric"
## BookingsCheckedIn PersonsNights RoomNights
## "numeric" "numeric" "numeric"
## DaysSinceLastStay DaysSinceFirstStay DistributionChannel
## "numeric" "numeric" "factor"
## MarketSegment SRHighFloor SRLowFloor
## "factor" "factor" "factor"
## SRAccessibleRoom SRMediumFloor SRBathtub
## "factor" "factor" "factor"
## SRShower SRCrib SRKingSizeBed
## "factor" "factor" "factor"
## SRTwinBed SRNearElevator SRAwayFromElevator
## "factor" "factor" "factor"
## SRNoAlcoholInMiniBar SRQuietRoom CheckedInCategory
## "factor" "factor" "character"
## CanceledCategory NoShowCategory CustomerOutcome
## "character" "character" "numeric"
summary(HotelLisbon_data$Nationality)
## FRA PRT DEU GBR ESP USA ITA BEL BRA NLD
## 11170 10365 9242 7750 4399 3076 3007 2806 2564 2461
## CHE IRL CAN AUT SWE CHN ISR NOR POL AUS
## 1913 1780 1364 1334 1119 816 802 708 662 638
## DNK FIN RUS ROU HUN JPN CZE GRC LUX IND
## 594 589 529 432 289 245 224 216 205 185
## KOR AGO MEX MAR ARG BGR TUR SRB HRV EST
## 176 162 156 155 154 143 141 136 127 112
## UKR NZL LVA MOZ DZA IRN TWN SVK CHL COL
## 88 84 83 70 69 68 67 66 65 62
## LTU ZAF SVN CYP ISL PHL AZE SGP PER THA
## 60 55 54 52 51 46 43 39 37 35
## MLT LBN URY SAU BLR ARE VNM TUN VEN CMR
## 33 31 31 29 28 27 27 25 23 22
## BIH ECU EGY IDN MYS KAZ PAN JOR MKD CRI
## 21 20 20 18 18 16 16 15 14 13
## DOM NGA ALB PAK KWT ARM CPV GNB IRQ KEN
## 13 13 12 12 11 10 10 10 10 10
## PRY BOL GEO ATF BHR CUB LKA AND BGD (Other)
## 10 9 9 8 8 8 8 7 7 228
#Nationality contains a large number of countries. Let’s group these countries into continents.
# Install countrycode only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("countrycode", quietly = TRUE)) {
  install.packages("countrycode")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'countrycode' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
library(countrycode)
## Warning: package 'countrycode' was built under R version 4.3.2
# Load the dplyr package for data manipulation
library(dplyr)
# Named lookup vector: ISO 3166-1 alpha-3 country code -> continent code.
# Continent codes: AF Africa, AN Antarctica, AS Asia, EU Europe,
# NA North America, OC Oceania, SA South America.
# Note that "NA" here is the character string for North America, not a
# missing value, so is.na() does not flag it.
# Each country code appears exactly once. The repeated LAO, ABW and ATA
# entries and the two-letter key "KI" (Kiribati is already present as KIR)
# were removed.
continent_mapping <- c(
  BRA = "SA", CAN = "NA", PHL = "AS", FRA = "EU", HUN = "EU",
  ITA = "EU", EST = "EU", BEL = "EU", PRT = "EU", GBR = "EU",
  USA = "NA", CHN = "AS", IRL = "EU", DEU = "EU", CHE = "EU",
  AUS = "OC", ESP = "EU", RUS = "EU", AUT = "EU", ROU = "EU",
  NLD = "EU", TUR = "EU", ARG = "SA", CZE = "EU", CYP = "EU",
  FIN = "EU", POL = "EU", NOR = "EU", JPN = "AS", NZL = "OC",
  CAF = "AF", SRB = "EU", BGR = "EU", SYR = "AS", UKR = "EU",
  VEN = "SA", SWE = "EU", ISR = "AS", DNK = "EU", URY = "SA",
  MAR = "AF", AGO = "AF", IND = "AS", CHL = "SA", BHR = "AS",
  MEX = "NA", THA = "AS", PAK = "AS", IDN = "AS", HRV = "EU",
  LUX = "EU", ARE = "AS", TUN = "AF", LVA = "EU", UZB = "AS",
  TGO = "AF", SGP = "AS", BLR = "EU", GRC = "EU", ARM = "AS",
  DZA = "AF", GNB = "AF", SVK = "EU", CRI = "NA", EGY = "AF",
  DOM = "NA", IRN = "AS", SVN = "EU", ZAF = "AF", MKD = "EU",
  HKG = "AS", ISL = "EU", MDV = "AS", MOZ = "AF", IRQ = "AS",
  MYS = "AS", LCA = "NA", KOR = "AS", SUR = "SA", LTU = "EU",
  PRY = "SA", BOL = "SA", TWN = "AS", STP = "AF", MMR = "AS",
  CMR = "AF", SAU = "AS", KWT = "AS", AZE = "AS", JEY = "EU",
  COL = "SA", ALB = "EU", PAN = "NA", LBN = "AS", ECU = "SA",
  NGA = "AF", MUS = "AF", MLT = "EU", BIH = "EU", KAZ = "AS",
  LBY = "AF", CPV = "AF", LKA = "AS", ATA = "AN", PER = "SA",
  CIV = "AF", VNM = "AS", MNE = "EU", COM = "AF", OMN = "AS",
  QAT = "AS", GAB = "AF", COD = "AF", BGD = "AS", NAM = "AF",
  HTI = "NA", GEO = "AS", GIB = "EU", JOR = "AS", SYC = "AF",
  LIE = "EU", SEN = "AF", ATF = "AN", KEN = "AF", VIR = "NA",
  PYF = "OC", UGA = "AF", TZA = "AF", SMR = "EU", KGZ = "AS",
  PRI = "NA", NCL = "OC", BWA = "AF", GTM = "NA", BRB = "NA",
  MWI = "AF", NIC = "NA", LAO = "AS", MLI = "AF", RWA = "AF",
  ASM = "OC", DMA = "NA", MRT = "AF", AIA = "NA", CUB = "NA",
  SDN = "AF", JAM = "NA", TKM = "AS", SWZ = "AF", MCO = "EU",
  WSM = "OC", AND = "EU", KNA = "NA", ERI = "AF", BEN = "AF",
  SLV = "NA", GUF = "SA", ABW = "NA", FRO = "EU", ZWE = "AF",
  ATG = "NA", SLE = "AF", GUY = "SA", TCD = "AF", FLK = "SA",
  SPM = "NA", SOM = "AF", GHA = "AF", UMI = "OC", TJK = "AS",
  ETH = "AF", KIR = "OC", PCN = "OC", MNG = "AS",
  BTN = "AS", MHL = "OC", VUT = "OC", TLS = "AS",
  FJI = "OC", COK = "OC", NRU = "OC", TUV = "OC", SLB = "OC",
  MNP = "OC", FSM = "OC", PLW = "OC", MAF = "NA", SXM = "NA",
  CUW = "NA", BES = "NA", SGS = "AN", BVT = "AN",
  CXR = "AS", CCK = "AS", HMD = "AN", NFK = "OC"
)
# Convert Nationality to character type.
# Nationality is a factor at this point; indexing a named vector with a
# factor would use its integer level codes rather than the country-code
# names, so it must be character before the lookup below.
HotelLisbon_data$Nationality <- as.character(HotelLisbon_data$Nationality)
# Add a new column 'Continent' by looking each country code up in
# continent_mapping; codes absent from the mapping yield NA.
HotelLisbon_data <- HotelLisbon_data %>%
mutate(Continent = continent_mapping[Nationality])
# Collect the distinct country codes that received no continent.
# The string "NA" (North America) is a real value and is not caught by is.na().
missing_continents <- unique(HotelLisbon_data$Nationality[is.na(HotelLisbon_data$Continent)])
# Print the unmapped country codes to see which ones still need a continent
print(missing_continents)
## [1] "TON" "IOT" "WLF" "BHS" "MDG" "GNQ" "YEM" "PNG" "VCT" "NPL" "GIN"
# Recompute 'Continent' from continent_mapping.
# NOTE(review): this repeats the lookup already performed before the first
# missing-code check; it overwrites the column with the same values.
HotelLisbon_data <- HotelLisbon_data %>%
mutate(Continent = continent_mapping[Nationality])
# Manually assign continents for the country codes that the main mapping
# did not cover (the codes printed by the missing-continent check).
# TON is the ISO 3166-1 alpha-3 code for Tonga, which lies in Oceania;
# Togo (Africa) is TGO. TON was previously assigned to "AF".
manual_continents <- c(
  "TON" = "OC",
  "IOT" = "AS",
  "WLF" = "OC",
  "BHS" = "NA",
  "MDG" = "AF",
  "GNQ" = "AF",
  "YEM" = "AS",
  "PNG" = "OC",
  "VCT" = "NA",
  "NPL" = "AS",
  "GIN" = "AF"
)
# Update the Continent column for the manually assigned countries:
# rows whose Nationality is a name in manual_continents take the manual
# value; all other rows keep their existing Continent.
HotelLisbon_data <- HotelLisbon_data %>%
mutate(Continent = ifelse(Nationality %in% names(manual_continents), manual_continents[Nationality], Continent))
# Re-check for country codes that still have no continent
missing_continents <- unique(HotelLisbon_data$Nationality[is.na(HotelLisbon_data$Continent)])
# Print the remaining unmapped codes; character(0) means every code is mapped
print(missing_continents)
## character(0)
We have managed to manually input all countries into a new variable called continent.
# Convert 'Continent' column to a factor so it is treated as a categorical
# variable; "NA" (North America) becomes an ordinary factor level.
HotelLisbon_data$Continent <- as.factor(HotelLisbon_data$Continent)
# Check the structure of the dataset to confirm the changes
str(HotelLisbon_data)
## tibble [75,000 × 37] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:75000] 1 2 3 4 5 6 7 8 9 10 ...
## $ ID : num [1:75000] 20351 62663 30398 39784 17929 ...
## $ Nationality : chr [1:75000] "BRA" "CAN" "PHL" "FRA" ...
## $ Age : num [1:75000] 85 30 70 31 29 46 49 55 53 47 ...
## $ DaysSinceCreation : num [1:75000] 733 178 564 430 785 314 794 237 817 750 ...
## $ NameHash : chr [1:75000] "0x0BF2ECC3BF14F7FF3F926275E9BAAFAFF5823E69F81319DCCC5867DC986E10DC" "0xE4899D5F1CF2354CE1EBCD1717CE2EC2D91DE694C9118ADA37CB726A3F43DE22" "0xA1D0401C1635B389B99596B21D6C463B6E63444457B010DDD3D31AC1CE19C2ED" "0x0C948619213E11A1EB2E326CA64277BAFADDDDE51D7D2EB9DA7B71295C8704BC" ...
## $ DocIDHash : chr [1:75000] "0x44749B4F7510099B0A4BEF85DE72E75ABD3CC90896949AAC4EF1A46598DCE490" "0x78C451F6556F7129351AE28B3BA7DD499E258DDBEC31F89302AE62899301DB4A" "0x613A9E9859B7CA68E8D3613BE4B2880059B2A5134E2B2B8C85186EAC073A3AC8" "0x56D89BC906AA74D89F63D75436A2BBC0B2DE9EAD2D49CEA970713BD02290AE54" ...
## $ AverageLeadTime : num [1:75000] 41 119 94 47 148 0 33 230 213 157 ...
## $ LodgingRevenue : num [1:75000] 53 1041 1512 219 269 ...
## $ OtherRevenue : num [1:75000] 14 162 72 146 58.5 ...
## $ BookingsCanceled : num [1:75000] 0 0 0 0 0 1 0 0 0 0 ...
## $ BookingsNoShowed : num [1:75000] 0 0 0 0 0 1 0 0 0 0 ...
## $ BookingsCheckedIn : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
## $ PersonsNights : num [1:75000] 2 6 8 6 8 0 2 5 8 6 ...
## $ RoomNights : num [1:75000] 1 3 4 3 4 0 2 5 4 3 ...
## $ DaysSinceLastStay : num [1:75000] 734 181 568 433 789 1 796 242 821 753 ...
## $ DaysSinceFirstStay : num [1:75000] 734 181 568 433 789 1 796 242 821 753 ...
## $ DistributionChannel : Factor w/ 4 levels "Corporate","Direct",..: 4 4 4 4 2 4 2 4 4 4 ...
## $ MarketSegment : Factor w/ 7 levels "Aviation","Complementary",..: 7 6 6 7 4 6 4 5 6 7 ...
## $ SRHighFloor : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ SRLowFloor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRAccessibleRoom : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRMediumFloor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRBathtub : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRShower : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRCrib : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
## $ SRKingSizeBed : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 1 ...
## $ SRTwinBed : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
## $ SRNearElevator : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRAwayFromElevator : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRNoAlcoholInMiniBar: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRQuietRoom : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
## $ CheckedInCategory : chr [1:75000] "Stayed Once" "Stayed Once" "Stayed Once" "Stayed Once" ...
## $ CanceledCategory : chr [1:75000] "Checked In and Stayed" "Checked In and Stayed" "Checked In and Stayed" "Checked In and Stayed" ...
## $ NoShowCategory : chr [1:75000] "Customers Showed up and Stayed" "Customers Showed up and Stayed" "Customers Showed up and Stayed" "Customers Showed up and Stayed" ...
## $ CustomerOutcome : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
## $ Continent : Factor w/ 7 levels "AF","AN","AS",..: 7 5 3 4 4 4 4 4 4 4 ...
# Check the class of each variable: apply class() to every column and
# collect the results keyed by column name.
variable_classes <- sapply(HotelLisbon_data, class)
# Print the result
print(variable_classes)
## ...1 ID Nationality
## "numeric" "numeric" "character"
## Age DaysSinceCreation NameHash
## "numeric" "numeric" "character"
## DocIDHash AverageLeadTime LodgingRevenue
## "character" "numeric" "numeric"
## OtherRevenue BookingsCanceled BookingsNoShowed
## "numeric" "numeric" "numeric"
## BookingsCheckedIn PersonsNights RoomNights
## "numeric" "numeric" "numeric"
## DaysSinceLastStay DaysSinceFirstStay DistributionChannel
## "numeric" "numeric" "factor"
## MarketSegment SRHighFloor SRLowFloor
## "factor" "factor" "factor"
## SRAccessibleRoom SRMediumFloor SRBathtub
## "factor" "factor" "factor"
## SRShower SRCrib SRKingSizeBed
## "factor" "factor" "factor"
## SRTwinBed SRNearElevator SRAwayFromElevator
## "factor" "factor" "factor"
## SRNoAlcoholInMiniBar SRQuietRoom CheckedInCategory
## "factor" "factor" "character"
## CanceledCategory NoShowCategory CustomerOutcome
## "character" "character" "numeric"
## Continent
## "factor"
# Dropping variables
# The columns to remove are listed by name and excluded in one select() call.
dropped_columns <- c(
  "...1", "Nationality", "DaysSinceCreation", "NameHash", "DocIDHash",
  "AverageLeadTime", "DaysSinceLastStay", "DaysSinceFirstStay"
)
HotelLisbon_data <- select(HotelLisbon_data, -all_of(dropped_columns))
# Inspect the structure of the reduced dataset
str(HotelLisbon_data)
## tibble [75,000 × 29] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:75000] 20351 62663 30398 39784 17929 ...
## $ Age : num [1:75000] 85 30 70 31 29 46 49 55 53 47 ...
## $ LodgingRevenue : num [1:75000] 53 1041 1512 219 269 ...
## $ OtherRevenue : num [1:75000] 14 162 72 146 58.5 ...
## $ BookingsCanceled : num [1:75000] 0 0 0 0 0 1 0 0 0 0 ...
## $ BookingsNoShowed : num [1:75000] 0 0 0 0 0 1 0 0 0 0 ...
## $ BookingsCheckedIn : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
## $ PersonsNights : num [1:75000] 2 6 8 6 8 0 2 5 8 6 ...
## $ RoomNights : num [1:75000] 1 3 4 3 4 0 2 5 4 3 ...
## $ DistributionChannel : Factor w/ 4 levels "Corporate","Direct",..: 4 4 4 4 2 4 2 4 4 4 ...
## $ MarketSegment : Factor w/ 7 levels "Aviation","Complementary",..: 7 6 6 7 4 6 4 5 6 7 ...
## $ SRHighFloor : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ SRLowFloor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRAccessibleRoom : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRMediumFloor : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRBathtub : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRShower : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRCrib : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
## $ SRKingSizeBed : Factor w/ 2 levels "0","1": 1 1 1 1 1 2 1 1 2 1 ...
## $ SRTwinBed : Factor w/ 2 levels "0","1": 2 1 1 1 1 1 1 1 1 1 ...
## $ SRNearElevator : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRAwayFromElevator : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRNoAlcoholInMiniBar: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ SRQuietRoom : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 2 1 ...
## $ CheckedInCategory : chr [1:75000] "Stayed Once" "Stayed Once" "Stayed Once" "Stayed Once" ...
## $ CanceledCategory : chr [1:75000] "Checked In and Stayed" "Checked In and Stayed" "Checked In and Stayed" "Checked In and Stayed" ...
## $ NoShowCategory : chr [1:75000] "Customers Showed up and Stayed" "Customers Showed up and Stayed" "Customers Showed up and Stayed" "Customers Showed up and Stayed" ...
## $ CustomerOutcome : num [1:75000] 1 1 1 1 1 0 1 1 1 1 ...
## $ Continent : Factor w/ 7 levels "AF","AN","AS",..: 7 5 3 4 4 4 4 4 4 4 ...
# Let's change the variables into the specific classes we want:
# ID becomes a character identifier, while the booking counts, the derived
# category labels and CustomerOutcome become factors.
vars_to_factor <- c(
  "BookingsCanceled", "BookingsNoShowed", "BookingsCheckedIn",
  "CheckedInCategory", "CanceledCategory", "NoShowCategory", "CustomerOutcome"
)
HotelLisbon_data <- HotelLisbon_data %>%
  mutate(
    ID = as.character(ID),
    across(all_of(vars_to_factor), as.factor)
  )
# Show the resulting class of every column
sapply(HotelLisbon_data, class)
## ID Age LodgingRevenue
## "character" "numeric" "numeric"
## OtherRevenue BookingsCanceled BookingsNoShowed
## "numeric" "factor" "factor"
## BookingsCheckedIn PersonsNights RoomNights
## "factor" "numeric" "numeric"
## DistributionChannel MarketSegment SRHighFloor
## "factor" "factor" "factor"
## SRLowFloor SRAccessibleRoom SRMediumFloor
## "factor" "factor" "factor"
## SRBathtub SRShower SRCrib
## "factor" "factor" "factor"
## SRKingSizeBed SRTwinBed SRNearElevator
## "factor" "factor" "factor"
## SRAwayFromElevator SRNoAlcoholInMiniBar SRQuietRoom
## "factor" "factor" "factor"
## CheckedInCategory CanceledCategory NoShowCategory
## "factor" "factor" "factor"
## CustomerOutcome Continent
## "factor" "factor"
# Correlation table for numeric values.
# Keep only the numeric columns of the dataset
numeric_variables <- HotelLisbon_data %>%
  select(where(is.numeric))
# Pairwise correlations between those columns (cor() defaults)
correlation_matrix <- cor(numeric_variables)
# Show the matrix
print(correlation_matrix)
## Age LodgingRevenue OtherRevenue PersonsNights
## Age 1.000000000 -0.001258176 0.08892854 0.04338341
## LodgingRevenue -0.001258176 1.000000000 0.53779318 0.64978685
## OtherRevenue 0.088928543 0.537793184 1.00000000 0.53952786
## PersonsNights 0.043383409 0.649786850 0.53952786 1.00000000
## RoomNights 0.051863488 0.690830179 0.48239974 0.84794739
## RoomNights
## Age 0.05186349
## LodgingRevenue 0.69083018
## OtherRevenue 0.48239974
## PersonsNights 0.84794739
## RoomNights 1.00000000
#let’s create pivot tables to explore the relationship between “CustomerOutcome” and the variables: “DistributionChannel,” “MarketSegment,” and “Continent.”
# Pivot table for DistributionChannel and CustomerOutcome:
# a two-way count table with one row per DistributionChannel level and one
# column per CustomerOutcome value (0 / 1).
pivot_table_distribution <- table(HotelLisbon_data$DistributionChannel, HotelLisbon_data$CustomerOutcome)
# Display the pivot table
pivot_table_distribution
##
## 0 1
## Corporate 304 2036
## Direct 3051 7663
## Electronic Distribution 17 439
## Travel Agent/Operator 14648 46842
# Pivot table for MarketSegment and CustomerOutcome:
# a two-way count table with one row per MarketSegment level and one
# column per CustomerOutcome value (0 / 1).
pivot_table_market <- table(HotelLisbon_data$MarketSegment, HotelLisbon_data$CustomerOutcome)
# Display the pivot table
pivot_table_market
##
## 0 1
## Aviation 24 197
## Complementary 107 346
## Corporate 238 1701
## Direct 2972 7340
## Groups 1471 7048
## Other 11045 32001
## Travel Agent/Operator 2163 8347
# Pivot table for Continent and CustomerOutcome:
# a two-way count table with one row per Continent level and one
# column per CustomerOutcome value (0 / 1).
pivot_table_continent <- table(HotelLisbon_data$Continent, HotelLisbon_data$CustomerOutcome)
# Display the pivot table
pivot_table_continent
##
## 0 1
## AF 164 560
## AN 4 6
## AS 709 2106
## EU 14644 48400
## NA 1440 3249
## OC 208 527
## SA 851 2132
#ANOVA Tests for Numerical Variables: Age, LodgingRevenue, OtherRevenue, PersonsNights, RoomNights
# ANOVA for Age: one-way analysis of variance comparing mean Age between
# the CustomerOutcome groups; summary() prints the F test table.
age_anova <- aov(Age ~ CustomerOutcome, data = HotelLisbon_data)
summary(age_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## CustomerOutcome 1 67013 67013 334.1 <2e-16 ***
## Residuals 74998 15042084 201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for LodgingRevenue: one-way analysis of variance comparing mean
# LodgingRevenue between the CustomerOutcome groups.
LodgingRevenue_anova <- aov(LodgingRevenue ~ CustomerOutcome, data = HotelLisbon_data)
summary(LodgingRevenue_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## CustomerOutcome 1 2.021e+09 2.021e+09 17898 <2e-16 ***
## Residuals 74998 8.469e+09 1.129e+05
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for OtherRevenue: one-way analysis of variance comparing mean
# OtherRevenue between the CustomerOutcome groups.
OtherRevenue_anova <- aov(OtherRevenue ~ CustomerOutcome, data = HotelLisbon_data)
summary(OtherRevenue_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## CustomerOutcome 1 104542978 104542978 9641 <2e-16 ***
## Residuals 74998 813288511 10844
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for PersonsNights: one-way analysis of variance comparing mean
# PersonsNights between the CustomerOutcome groups.
# NOTE(review): rows without a check-in appear to have PersonsNights = 0,
# so part of this difference may be definitional — confirm.
PersonsNights_anova <- aov(PersonsNights ~ CustomerOutcome, data = HotelLisbon_data)
summary(PersonsNights_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## CustomerOutcome 1 493983 493983 34844 <2e-16 ***
## Residuals 74998 1063257 14
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA for RoomNights: one-way analysis of variance comparing mean
# RoomNights between the CustomerOutcome groups.
# NOTE(review): rows without a check-in appear to have RoomNights = 0,
# so part of this difference may be definitional — confirm.
RoomNights_anova <- aov(RoomNights ~ CustomerOutcome, data = HotelLisbon_data)
summary(RoomNights_anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## CustomerOutcome 1 123150 123150 34787 <2e-16 ***
## Residuals 74998 265498 4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Chi-Square Tests for Categorical Variables:
# Chi-square test for DistributionChannel: Pearson test of independence on
# the DistributionChannel x CustomerOutcome count table.
distribution_chi2 <- chisq.test(table(HotelLisbon_data$DistributionChannel, HotelLisbon_data$CustomerOutcome))
print(distribution_chi2)
##
## Pearson's Chi-squared test
##
## data: table(HotelLisbon_data$DistributionChannel, HotelLisbon_data$CustomerOutcome)
## X-squared = 376.69, df = 3, p-value < 2.2e-16
# Chi-square test for MarketSegment: Pearson test of independence on the
# MarketSegment x CustomerOutcome count table.
segment_chi2 <- chisq.test(table(HotelLisbon_data$MarketSegment, HotelLisbon_data$CustomerOutcome))
print(segment_chi2)
##
## Pearson's Chi-squared test
##
## data: table(HotelLisbon_data$MarketSegment, HotelLisbon_data$CustomerOutcome)
## X-squared = 642, df = 6, p-value < 2.2e-16
# Chi-square test for Continent: Pearson test of independence on the
# Continent x CustomerOutcome count table.
# This call emits "Chi-squared approximation may be incorrect"; the AN row
# holds only 4 and 6 observations.
# NOTE(review): consider simulate.p.value = TRUE or merging sparse levels.
continent_chi2 <- chisq.test(table(HotelLisbon_data$Continent, HotelLisbon_data$CustomerOutcome))
## Warning in chisq.test(table(HotelLisbon_data$Continent,
## HotelLisbon_data$CustomerOutcome)): Chi-squared approximation may be incorrect
print(continent_chi2)
##
## Pearson's Chi-squared test
##
## data: table(HotelLisbon_data$Continent, HotelLisbon_data$CustomerOutcome)
## X-squared = 181.45, df = 6, p-value < 2.2e-16
colnames(HotelLisbon_data)
## [1] "ID" "Age" "LodgingRevenue"
## [4] "OtherRevenue" "BookingsCanceled" "BookingsNoShowed"
## [7] "BookingsCheckedIn" "PersonsNights" "RoomNights"
## [10] "DistributionChannel" "MarketSegment" "SRHighFloor"
## [13] "SRLowFloor" "SRAccessibleRoom" "SRMediumFloor"
## [16] "SRBathtub" "SRShower" "SRCrib"
## [19] "SRKingSizeBed" "SRTwinBed" "SRNearElevator"
## [22] "SRAwayFromElevator" "SRNoAlcoholInMiniBar" "SRQuietRoom"
## [25] "CheckedInCategory" "CanceledCategory" "NoShowCategory"
## [28] "CustomerOutcome" "Continent"
# Let's confirm that the special-request (SR*) variables are in fact binary:
# for each one, print its value counts and whether it takes exactly two
# distinct values.
binary_vars <- c(
  "SRHighFloor", "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor",
  "SRBathtub", "SRShower", "SRCrib", "SRKingSizeBed", "SRTwinBed",
  "SRNearElevator", "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom"
)
for (var in binary_vars) {
  result <- table(HotelLisbon_data[[var]])
  is_binary <- length(unique(HotelLisbon_data[[var]])) == 2
  print(paste("Variable:", var))
  print(result)
  print(if (is_binary) "This is a binary variable." else "This is not a binary variable.")
  cat("\n")
}
## [1] "Variable: SRHighFloor"
##
## 0 1
## 71435 3565
## [1] "This is a binary variable."
##
## [1] "Variable: SRLowFloor"
##
## 0 1
## 74897 103
## [1] "This is a binary variable."
##
## [1] "Variable: SRAccessibleRoom"
##
## 0 1
## 74979 21
## [1] "This is a binary variable."
##
## [1] "Variable: SRMediumFloor"
##
## 0 1
## 74933 67
## [1] "This is a binary variable."
##
## [1] "Variable: SRBathtub"
##
## 0 1
## 74784 216
## [1] "This is a binary variable."
##
## [1] "Variable: SRShower"
##
## 0 1
## 74866 134
## [1] "This is a binary variable."
##
## [1] "Variable: SRCrib"
##
## 0 1
## 74006 994
## [1] "This is a binary variable."
##
## [1] "Variable: SRKingSizeBed"
##
## 0 1
## 48583 26417
## [1] "This is a binary variable."
##
## [1] "Variable: SRTwinBed"
##
## 0 1
## 64277 10723
## [1] "This is a binary variable."
##
## [1] "Variable: SRNearElevator"
##
## 0 1
## 74973 27
## [1] "This is a binary variable."
##
## [1] "Variable: SRAwayFromElevator"
##
## 0 1
## 74729 271
## [1] "This is a binary variable."
##
## [1] "Variable: SRNoAlcoholInMiniBar"
##
## 0 1
## 74990 10
## [1] "This is a binary variable."
##
## [1] "Variable: SRQuietRoom"
##
## 0 1
## 68398 6602
## [1] "This is a binary variable."
Visualisations:
# Load ggplot2 library
library(ggplot2)
# Set a color palette for better visualization (one color per CustomerOutcome level)
colors <- c("#1f78b4", "#33a02c")
# Box plots for numerical variables, one plot per variable, split by CustomerOutcome
numerical_vars <- c("Age", "LodgingRevenue", "OtherRevenue", "PersonsNights", "RoomNights")
for (var in numerical_vars) {
  # .data[[var]] selects the column named by `var` from the plot data only;
  # get(var) could silently resolve to a same-named object outside the data.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = CustomerOutcome, y = .data[[var]], fill = factor(CustomerOutcome))
  ) +
    geom_boxplot() +
    labs(
      title = paste("Box plot for", var, "vs CustomerOutcome"),
      x = "CustomerOutcome",
      y = var
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal() +
    theme(legend.position = "none")
  print(p)
}
# Bar plots for categorical variables: one dodged bar chart per variable,
# with bars filled by CustomerOutcome
categorical_vars <- c(
  "BookingsCanceled", "BookingsNoShowed", "BookingsCheckedIn",
  "DistributionChannel", "MarketSegment", "Continent"
)
for (var in categorical_vars) {
  # .data[[var]] selects the column named by `var` from the plot data only;
  # get(var) could silently resolve to a same-named object outside the data.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = factor(.data[[var]]), fill = factor(CustomerOutcome))
  ) +
    geom_bar(position = "dodge") +
    labs(
      title = paste("Bar plot for", var, "vs CustomerOutcome"),
      x = var,
      y = "Count"
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal()
  print(p)
}
# Grouped box plots for mixed variables: Age on the y axis, one categorical
# variable on the x axis, boxes filled by CustomerOutcome
mixed_vars <- c("CheckedInCategory", "CanceledCategory", "NoShowCategory", "MarketSegment")
for (var in mixed_vars) {
  # .data[[var]] selects the column named by `var` from the plot data only;
  # get(var) could silently resolve to a same-named object outside the data.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = factor(.data[[var]]), y = Age, fill = factor(CustomerOutcome))
  ) +
    geom_boxplot() +
    labs(
      title = paste("Box plot for Age vs", var, "vs CustomerOutcome"),
      x = var,
      y = "Age"
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal() +
    theme(legend.position = "none")
  print(p)
}
# Load ggplot2 library
library(ggplot2)
# Set a color palette for better visualization (one color per CustomerOutcome level)
colors <- c("#1f78b4", "#33a02c")
# Binary variables (special-request flags)
binary_vars <- c(
  "SRHighFloor", "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor",
  "SRBathtub", "SRShower", "SRCrib", "SRKingSizeBed", "SRTwinBed",
  "SRNearElevator", "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom"
)
# Visualize binary variables against CustomerOutcome: one dodged bar chart
# per flag, with bars filled by CustomerOutcome
for (var in binary_vars) {
  # .data[[var]] selects the column named by `var` from the plot data only;
  # get(var) could silently resolve to a same-named object outside the data.
  p <- ggplot(
    HotelLisbon_data,
    aes(x = factor(.data[[var]]), fill = factor(CustomerOutcome))
  ) +
    geom_bar(position = "dodge") +
    labs(
      title = paste("Bar plot for", var, "vs CustomerOutcome"),
      x = var,
      y = "Count"
    ) +
    scale_fill_manual(values = colors) +
    theme_minimal()
  print(p)
}
# Load ggplot2 library
library(ggplot2)
# Two fill colours, one per CustomerOutcome level
colors <- c("#1f78b4", "#33a02c")
# Binary special-request (SR*) variables
binary_vars <- c("SRHighFloor", "SRLowFloor", "SRAccessibleRoom", "SRMediumFloor",
                 "SRBathtub", "SRShower", "SRCrib", "SRKingSizeBed", "SRTwinBed",
                 "SRNearElevator", "SRAwayFromElevator", "SRNoAlcoholInMiniBar", "SRQuietRoom")
# Variables used to split each plot into facet panels
other_vars <- c("MarketSegment", "CheckedInCategory", "PersonsNights")
# Bar plots of every binary variable vs CustomerOutcome, faceted in turn by
# MarketSegment, CheckedInCategory and PersonsNights
for (other_var in other_vars) {
  # Work on a copy so HotelLisbon_data itself is left untouched
  plot_data <- HotelLisbon_data
  facet_values <- plot_data[[other_var]]
  if (is.numeric(facet_values)) {
    # A numeric facet variable would produce one panel per distinct value, so
    # it is grouped into quartile ranges first. unique() removes repeated
    # quantiles, which cut() would otherwise reject as duplicate breaks.
    quartile_breaks <- unique(quantile(facet_values, probs = seq(0, 1, 0.25), na.rm = TRUE))
    if (length(quartile_breaks) >= 2) {
      facet_values <- cut(facet_values, breaks = quartile_breaks, include.lowest = TRUE)
    }
  }
  plot_data$FacetGroup <- facet_values
  for (var in binary_vars) {
    # .data[[var]] picks the column whose name is stored in `var` directly from
    # the plotted data, instead of relying on get() to find it by name lookup.
    p <- ggplot(plot_data,
                aes(x = factor(.data[[var]]), fill = factor(CustomerOutcome))) +
      geom_bar(position = "dodge") +
      facet_grid(. ~ FacetGroup) +
      labs(title = paste("Bar plot for", var, "vs CustomerOutcome by", other_var),
           x = var,
           y = "Count") +
      scale_fill_manual(values = colors) +
      theme_minimal() +
      # Tilt the x labels so they do not overlap inside narrow panels
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
    # Explicit print: plots are not displayed automatically inside a for loop
    print(p)
  }
}
# Load ggplot2 library
library(ggplot2)
# Two fill colours, one per CustomerOutcome level
colors <- c("#1f78b4", "#33a02c")
# Categorical variables
categorical_vars <- c("DistributionChannel", "MarketSegment", "Continent", "CheckedInCategory")
# Bar plots: counts of each categorical variable, dodged by CustomerOutcome
for (var in categorical_vars) {
  # .data[[var]] picks the column whose name is stored in `var` directly from
  # the plotted data, instead of relying on get() to find it by name lookup.
  p <- ggplot(HotelLisbon_data,
              aes(x = factor(.data[[var]]), fill = factor(CustomerOutcome))) +
    geom_bar(position = "dodge") +
    labs(title = paste("Bar plot for", var, "vs CustomerOutcome"),
         x = var,
         y = "Count") +
    scale_fill_manual(values = colors) +
    theme_minimal()
  # Explicit print: plots are not displayed automatically inside a for loop
  print(p)
}
# Numeric variables
numeric_vars <- c("Age", "LodgingRevenue", "OtherRevenue", "PersonsNights", "RoomNights")
# Box plots: distribution of each numeric variable within each CustomerOutcome class
for (var in numeric_vars) {
  # .data[[var]] picks the column whose name is stored in `var` directly from
  # the plotted data, instead of relying on get() to find it by name lookup.
  p <- ggplot(HotelLisbon_data,
              aes(x = factor(CustomerOutcome), y = .data[[var]], fill = factor(CustomerOutcome))) +
    geom_boxplot() +
    labs(title = paste("Box plot for", var, "vs CustomerOutcome"),
         x = "CustomerOutcome",
         y = var) +
    scale_fill_manual(values = colors) +
    theme_minimal()
  # Explicit print: plots are not displayed automatically inside a for loop
  print(p)
}
# Bar chart of the number of customers in each market segment
library(ggplot2)
ggplot(data = HotelLisbon_data, mapping = aes(x = MarketSegment)) +
  geom_bar(colour = "black", fill = "blue") +
  ggtitle("Number of Customers in Each Market Segment") +
  xlab("Market Segment") +
  ylab("Count") +
  theme_minimal()
# Box plots (not a histogram) of LodgingRevenue for each BookingsCheckedIn value;
# outliers are drawn in red
# NOTE(review): only the values "0" and "1" are relabelled below - confirm
# that BookingsCheckedIn takes no other values.
ggplot(data = HotelLisbon_data,
       mapping = aes(x = as.factor(BookingsCheckedIn), y = LodgingRevenue)) +
  geom_boxplot(fill = "lightgreen", outlier.color = "red") +
  scale_x_discrete(labels = c("0" = "No", "1" = "Yes")) +
  ggtitle("Lodging Revenue by Bookings Checked In") +
  xlab("Bookings Checked In") +
  ylab("Lodging Revenue")
# Four plots comparing other variables against BookingsCheckedIn (used as fill).
# NOTE(review): the "No"/"Yes" fill labels assume BookingsCheckedIn has exactly
# two levels - confirm.
# The legend title comes from the fill scale's `name` in every plot.

# Dodged bars: BookingsNoShowed counts split by BookingsCheckedIn
ggplot(data = HotelLisbon_data,
       mapping = aes(x = as.factor(BookingsNoShowed), fill = as.factor(BookingsCheckedIn))) +
  geom_bar(position = position_dodge()) +
  scale_fill_discrete(name = "Checked In", labels = c("No", "Yes")) +
  ggtitle("Bookings No Shows vs. Bookings Checked In") +
  xlab("Bookings No Shows") +
  ylab("Count")

# Dodged bars: BookingsCanceled counts split by BookingsCheckedIn
ggplot(data = HotelLisbon_data,
       mapping = aes(x = as.factor(BookingsCanceled), fill = as.factor(BookingsCheckedIn))) +
  geom_bar(position = position_dodge()) +
  scale_fill_discrete(name = "Checked In", labels = c("No", "Yes")) +
  ggtitle("Bookings Cancelled vs. Bookings Checked In") +
  xlab("Bookings Cancelled") +
  ylab("Count")

# Stacked bars: MarketSegment counts split by BookingsCheckedIn
ggplot(data = HotelLisbon_data,
       mapping = aes(x = MarketSegment, fill = as.factor(BookingsCheckedIn))) +
  geom_bar(position = position_stack()) +
  scale_fill_discrete(name = "Checked In", labels = c("No", "Yes")) +
  ggtitle("Market Segment and Bookings Checked In") +
  xlab("Market Segment") +
  ylab("Count")

# Overlaid, semi-transparent histograms of LodgingRevenue by BookingsCheckedIn
ggplot(data = HotelLisbon_data,
       mapping = aes(x = LodgingRevenue, fill = as.factor(BookingsCheckedIn))) +
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  scale_fill_discrete(name = "Checked In", labels = c("No", "Yes")) +
  ggtitle("Lodging Revenue by Bookings Checked In") +
  xlab("Lodging Revenue") +
  ylab("Frequency")
# Point plot of LodgingRevenue for each MarketSegment, coloured by BookingsCheckedIn
# NOTE(review): the "No"/"Yes" colour labels assume BookingsCheckedIn has
# exactly two levels - confirm.
ggplot(data = HotelLisbon_data,
       mapping = aes(x = MarketSegment, y = LodgingRevenue,
                     colour = as.factor(BookingsCheckedIn))) +
  geom_point(alpha = 0.6) +
  scale_colour_discrete(name = "Checked In", labels = c("No", "Yes")) +
  ggtitle("Scatter Plot of Lodging Revenue vs. Market Segment") +
  xlab("Market Segment") +
  ylab("Lodging Revenue")
#There are skewed graphs and outliers that we need to resolve
# Box plots to identify outliers in the numeric revenue variables
boxplot(HotelLisbon_data$LodgingRevenue, main = "Boxplot for Lodging Revenue")
boxplot(HotelLisbon_data$OtherRevenue, main = "Boxplot for Other Revenue")
# MarketSegment and DistributionChannel are categorical, and a box plot needs
# numeric values, so it is not meaningful for them. Show how many customers
# fall in each category instead; las = 2 rotates the category labels so they fit.
barplot(table(HotelLisbon_data$MarketSegment),
        main = "Frequency of Market Segment", las = 2)
barplot(table(HotelLisbon_data$DistributionChannel),
        main = "Frequency of Distribution Channel", las = 2)
#There are potential outliers in Other Revenue, Bookings Canceled and
Bookings No-Shows. We need to discuss with the professor whether these
variables should be kept in the analysis or dropped; this is still an open question.
Now we need to balance our data. #Check the frequency of the dependent variable
# Number of customers in each CustomerOutcome class
table(HotelLisbon_data[["CustomerOutcome"]])
##
## 0 1
## 18020 56980
#0 represents customers who canceled or were No shows at the Hotel #1 represents customers who stayed at the hotel
# Bar chart of the class distribution of the dependent variable CustomerOutcome
library(ggplot2)
ggplot(data = HotelLisbon_data, mapping = aes(x = CustomerOutcome)) +
  ggtitle("Class Distribution of CustomerOutcome") +
  geom_bar()
# Class 0 is clearly the minority class, so the majority class (1) is
# undersampled further below to balance the data
# Share of customers in each CustomerOutcome class
proportions(table(HotelLisbon_data$CustomerOutcome))
##
## 0 1
## 0.2402667 0.7597333
# List the columns available in the data set
names(HotelLisbon_data)
## [1] "ID" "Age" "LodgingRevenue"
## [4] "OtherRevenue" "BookingsCanceled" "BookingsNoShowed"
## [7] "BookingsCheckedIn" "PersonsNights" "RoomNights"
## [10] "DistributionChannel" "MarketSegment" "SRHighFloor"
## [13] "SRLowFloor" "SRAccessibleRoom" "SRMediumFloor"
## [16] "SRBathtub" "SRShower" "SRCrib"
## [19] "SRKingSizeBed" "SRTwinBed" "SRNearElevator"
## [22] "SRAwayFromElevator" "SRNoAlcoholInMiniBar" "SRQuietRoom"
## [25] "CheckedInCategory" "CanceledCategory" "NoShowCategory"
## [28] "CustomerOutcome" "Continent"
#Balancing the Data Set
# Install ROSE only when it is missing, so the package is not downloaded and
# reinstalled on every run of the script
if (!requireNamespace("ROSE", quietly = TRUE)) {
  install.packages("ROSE")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'ROSE' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
# Install caret only when it is missing; reinstalling an already-installed
# package caused the "cannot remove prior installation" warning shown below
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'caret'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\mujta\AppData\Local\R\win-library\4.3\00LOCK\caret\libs\x64\caret.dll
## to C:\Users\mujta\AppData\Local\R\win-library\4.3\caret\libs\x64\caret.dll:
## Permission denied
## Warning: restored 'caret'
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
# Install pROC only when it is missing; reinstalling an already-installed
# package caused the "cannot remove prior installation" warning shown below
if (!requireNamespace("pROC", quietly = TRUE)) {
  install.packages("pROC")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'pROC' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'pROC'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\mujta\AppData\Local\R\win-library\4.3\00LOCK\pROC\libs\x64\pROC.dll to
## C:\Users\mujta\AppData\Local\R\win-library\4.3\pROC\libs\x64\pROC.dll:
## Permission denied
## Warning: restored 'pROC'
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
library(ROSE)
## Warning: package 'ROSE' was built under R version 4.3.2
## Loaded ROSE 0.0-4
library(caret)
## Warning: package 'caret' was built under R version 4.3.2
## Loading required package: lattice
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Undersample the majority class with ROSE. N is twice the number of class-0
# rows, i.e. the requested size of the resulting sample.
# seed fixes the random draw so the sample is reproducible from run to run.
# NOTE(review): sampling happens before the train/test split, so the test set
# is balanced as well - confirm this is intended.
UnderSampled_HotelLisbon <- ovun.sample(
  CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    BookingsCanceled + BookingsNoShowed + BookingsCheckedIn +
    PersonsNights + RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom +
    CheckedInCategory + CanceledCategory + NoShowCategory,
  data = HotelLisbon_data,
  method = "under",
  N = 2 * sum(HotelLisbon_data$CustomerOutcome == 0),
  seed = 123
)$data
# Preview the first rows of the undersampled data
head(UnderSampled_HotelLisbon)
## Age LodgingRevenue OtherRevenue BookingsCanceled BookingsNoShowed
## 1 46 231.0 31.0 0 0
## 2 35 505.5 63.5 0 0
## 3 25 218.0 26.8 0 0
## 4 49 394.2 101.0 0 0
## 5 20 680.0 5.0 0 0
## 6 34 237.6 34.5 0 0
## BookingsCheckedIn PersonsNights RoomNights DistributionChannel
## 1 1 4 1 Travel Agent/Operator
## 2 1 5 5 Electronic Distribution
## 3 1 4 2 Travel Agent/Operator
## 4 1 9 3 Travel Agent/Operator
## 5 1 5 5 Direct
## 6 1 6 3 Travel Agent/Operator
## MarketSegment SRHighFloor SRLowFloor SRAccessibleRoom SRMediumFloor SRBathtub
## 1 Other 0 0 0 0 0
## 2 Other 0 0 0 0 0
## 3 Other 0 0 0 0 0
## 4 Other 0 0 0 0 0
## 5 Direct 0 0 0 0 0
## 6 Other 0 0 0 0 0
## SRShower SRCrib SRKingSizeBed SRTwinBed SRNearElevator SRAwayFromElevator
## 1 0 0 1 0 0 0
## 2 0 0 0 0 0 0
## 3 0 0 1 0 0 0
## 4 0 0 1 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 1 0 0
## SRNoAlcoholInMiniBar SRQuietRoom CheckedInCategory CanceledCategory
## 1 0 0 Stayed Once Checked In and Stayed
## 2 0 0 Stayed Once Checked In and Stayed
## 3 0 0 Stayed Once Checked In and Stayed
## 4 0 1 Stayed Once Checked In and Stayed
## 5 0 0 Stayed Once Checked In and Stayed
## 6 0 1 Stayed Once Checked In and Stayed
## NoShowCategory CustomerOutcome Continent
## 1 Customers Showed up and Stayed 1 EU
## 2 Customers Showed up and Stayed 1 EU
## 3 Customers Showed up and Stayed 1 EU
## 4 Customers Showed up and Stayed 1 EU
## 5 Customers Showed up and Stayed 1 NA
## 6 Customers Showed up and Stayed 1 EU
# Class counts after undersampling, to verify that the data is now balanced
table(UnderSampled_HotelLisbon[["CustomerOutcome"]])
##
## 1 0
## 18020 18020
# Class proportions after undersampling
proportions(table(UnderSampled_HotelLisbon$CustomerOutcome))
##
## 1 0
## 0.5 0.5
Let’s hope it goes all well from here
# Training and testing: split the undersampled data on the dependent variable
# CustomerOutcome, with 60% of the rows going to training and the rest to testing.
# The seed makes the partition reproducible.
set.seed(123)
trainIndex <- createDataPartition(
  y = UnderSampled_HotelLisbon[["CustomerOutcome"]],
  p = 0.6,
  list = FALSE
)
TrainData <- UnderSampled_HotelLisbon[trainIndex, , drop = FALSE]
TestData <- UnderSampled_HotelLisbon[-trainIndex, , drop = FALSE]
# Correlation matrix of the numeric columns of the training data.
# vapply (rather than sapply) always returns a logical mask with one entry per
# column; drop = FALSE keeps a data frame even if only one column is numeric.
cor_matrix <- cor(TrainData[, vapply(TrainData, is.numeric, logical(1)), drop = FALSE])
print("Correlation Matrix for Training Data:")
## [1] "Correlation Matrix for Training Data:"
print(cor_matrix)
## Age LodgingRevenue OtherRevenue PersonsNights RoomNights
## Age 1.00000000 0.01906695 0.07989865 0.05972095 0.05702762
## LodgingRevenue 0.01906695 1.00000000 0.52716242 0.68260506 0.76958292
## OtherRevenue 0.07989865 0.52716242 1.00000000 0.53561882 0.47859785
## PersonsNights 0.05972095 0.68260506 0.53561882 1.00000000 0.83192274
## RoomNights 0.05702762 0.76958292 0.47859785 0.83192274 1.00000000
# Install ggpubr only when it is missing, instead of reinstalling on every run
if (!requireNamespace("ggpubr", quietly = TRUE)) {
  install.packages("ggpubr")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'ggpubr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.3.2
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::arrange() masks plyr::arrange()
## ✖ data.table::between() masks dplyr::between()
## ✖ purrr::compact() masks plyr::compact()
## ✖ dplyr::count() masks plyr::count()
## ✖ dplyr::desc() masks plyr::desc()
## ✖ dplyr::failwith() masks plyr::failwith()
## ✖ dplyr::filter() masks stats::filter()
## ✖ data.table::first() masks dplyr::first()
## ✖ data.table::hour() masks lubridate::hour()
## ✖ dplyr::id() masks plyr::id()
## ✖ data.table::isoweek() masks lubridate::isoweek()
## ✖ dplyr::lag() masks stats::lag()
## ✖ data.table::last() masks dplyr::last()
## ✖ purrr::lift() masks caret::lift()
## ✖ data.table::mday() masks lubridate::mday()
## ✖ data.table::minute() masks lubridate::minute()
## ✖ data.table::month() masks lubridate::month()
## ✖ ggpubr::mutate() masks dplyr::mutate(), plyr::mutate()
## ✖ data.table::quarter() masks lubridate::quarter()
## ✖ dplyr::rename() masks plyr::rename()
## ✖ data.table::second() masks lubridate::second()
## ✖ dplyr::summarise() masks plyr::summarise()
## ✖ dplyr::summarize() masks plyr::summarize()
## ✖ purrr::transpose() masks data.table::transpose()
## ✖ data.table::wday() masks lubridate::wday()
## ✖ data.table::week() masks lubridate::week()
## ✖ data.table::yday() masks lubridate::yday()
## ✖ data.table::year() masks lubridate::year()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(devtools)
## Loading required package: usethis
# Install psych only when it is missing, instead of reinstalling on every run
if (!requireNamespace("psych", quietly = TRUE)) {
  install.packages("psych")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'psych' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
library(psych)
## Warning: package 'psych' was built under R version 4.3.2
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Install PerformanceAnalytics only when it is missing; reinstalling an
# already-installed package caused the "cannot remove prior installation"
# warning shown below
if (!requireNamespace("PerformanceAnalytics", quietly = TRUE)) {
  install.packages("PerformanceAnalytics")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'PerformanceAnalytics' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'PerformanceAnalytics'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\mujta\AppData\Local\R\win-library\4.3\00LOCK\PerformanceAnalytics\libs\x64\PerformanceAnalytics.dll
## to
## C:\Users\mujta\AppData\Local\R\win-library\4.3\PerformanceAnalytics\libs\x64\PerformanceAnalytics.dll:
## Permission denied
## Warning: restored 'PerformanceAnalytics'
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
library(PerformanceAnalytics)
## Warning: package 'PerformanceAnalytics' was built under R version 4.3.2
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:data.table':
##
## first, last
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
library("ggplot2")
# Install rcompanion only when it is missing, instead of reinstalling on every run
if (!requireNamespace("rcompanion", quietly = TRUE)) {
  install.packages("rcompanion")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'rcompanion' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
library(rcompanion)
## Warning: package 'rcompanion' was built under R version 4.3.2
##
## Attaching package: 'rcompanion'
##
## The following object is masked from 'package:psych':
##
## phi
# Scatter-plot matrix of the numeric variables on their original scale
pairs(
  ~ Age + LodgingRevenue + OtherRevenue + PersonsNights + RoomNights,
  data = HotelLisbon_data
)
# Same matrix after a log(1 + x) transform of the numeric variables
# (Age, LodgingRevenue, OtherRevenue, PersonsNights and RoomNights)
log_transformed_data <- log1p(
  HotelLisbon_data[, c("Age", "LodgingRevenue", "OtherRevenue", "PersonsNights", "RoomNights")]
)
pairs(
  ~ Age + LodgingRevenue + OtherRevenue + PersonsNights + RoomNights,
  data = log_transformed_data
)
# Q-Q plots comparing each numeric variable against a normal distribution.
# Every ggqqplot() call receives the same variable as the qqnorm()/qqline()
# pair above it, so the plotted data matches its y-axis label.
# Age
qqnorm(HotelLisbon_data$Age)
qqline(HotelLisbon_data$Age)
ggqqplot(HotelLisbon_data$Age, ylab = "Age")
# Lodging revenue
qqnorm(HotelLisbon_data$LodgingRevenue)
qqline(HotelLisbon_data$LodgingRevenue)
ggqqplot(HotelLisbon_data$LodgingRevenue, ylab = "Lodging revenue")
# Persons nights
qqnorm(HotelLisbon_data$PersonsNights)
qqline(HotelLisbon_data$PersonsNights)
ggqqplot(HotelLisbon_data$PersonsNights, ylab = "Persons Per Night")
# Room nights
qqnorm(HotelLisbon_data$RoomNights)
qqline(HotelLisbon_data$RoomNights)
ggqqplot(HotelLisbon_data$RoomNights, ylab = "Rooms per night")
# Pearson correlation test between Age and LodgingRevenue.
# The argument expressions are kept as written because they are echoed in the
# "data:" line of the printed result.
cor_test_result <- cor.test(
  x = HotelLisbon_data$Age,
  y = HotelLisbon_data$LodgingRevenue,
  method = "pearson"
)
print(cor_test_result)
##
## Pearson's product-moment correlation
##
## data: HotelLisbon_data$Age and HotelLisbon_data$LodgingRevenue
## t = -0.34456, df = 74998, p-value = 0.7304
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.008414898 0.005898674
## sample estimates:
## cor
## -0.001258176
# Pearson correlation test between RoomNights and LodgingRevenue.
# The argument expressions are kept as written because they are echoed in the
# "data:" line of the printed result.
cor_test_result <- cor.test(
  x = HotelLisbon_data$RoomNights,
  y = HotelLisbon_data$LodgingRevenue,
  method = "pearson"
)
print(cor_test_result)
##
## Pearson's product-moment correlation
##
## data: HotelLisbon_data$RoomNights and HotelLisbon_data$LodgingRevenue
## t = 261.67, df = 74998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6870703 0.6945530
## sample estimates:
## cor
## 0.6908302
# Pearson correlation test between Age and OtherRevenue.
# The argument expressions are kept as written because they are echoed in the
# "data:" line of the printed result.
cor_test_result <- cor.test(
  x = HotelLisbon_data$Age,
  y = HotelLisbon_data$OtherRevenue,
  method = "pearson"
)
print(cor_test_result)
##
## Pearson's product-moment correlation
##
## data: HotelLisbon_data$Age and HotelLisbon_data$OtherRevenue
## t = 24.451, df = 74998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.08182382 0.09602423
## sample estimates:
## cor
## 0.08892854
# Install corrplot only when it is missing, instead of reinstalling on every run
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
## Installing package into 'C:/Users/mujta/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'corrplot' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\mujta\AppData\Local\Temp\RtmpMhGixg\downloaded_packages
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
# Correlation matrix of all numeric columns, drawn as a colour-coded grid.
# vapply (rather than sapply) always returns a logical mask with one entry per
# column; drop = FALSE keeps a data frame even if only one column is numeric.
correlation_matrix <- cor(
  HotelLisbon_data[, vapply(HotelLisbon_data, is.numeric, logical(1)), drop = FALSE]
)
corrplot(correlation_matrix, method = "color")
Let’s do Logistic regression On Dependent Variable CustomerOutcome with its regressors
# Logistic regression model 1: CustomerOutcome regressed on continent, age,
# revenue, nights, channel, segment, all special requests and
# CheckedInCategory, fitted on the training data
logistic_model1 <- glm(
  CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    PersonsNights + RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom +
    CheckedInCategory,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficients, significance and deviance of model 1
summary(logistic_model1)
##
## Call:
## glm(formula = CustomerOutcome ~ Continent + Age + LodgingRevenue +
## OtherRevenue + PersonsNights + RoomNights + DistributionChannel +
## MarketSegment + SRHighFloor + SRLowFloor + SRAccessibleRoom +
## SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
## SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
## SRQuietRoom + CheckedInCategory, family = "binomial", data = TrainData)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 5.748e+01 2.652e+03 0.022
## ContinentAN -1.127e+01 2.160e+05 0.000
## ContinentAS -1.013e+00 1.335e+00 -0.759
## ContinentEU -1.182e+00 8.835e-01 -1.337
## ContinentNA -1.378e+00 1.153e+00 -1.195
## ContinentOC -1.794e+01 3.111e+03 -0.006
## ContinentSA -1.730e+01 1.800e+03 -0.010
## Age 3.619e-02 1.441e-02 2.512
## LodgingRevenue 2.743e-04 2.416e-04 1.136
## OtherRevenue -5.470e-04 8.566e-04 -0.639
## PersonsNights -1.272e-01 7.698e-02 -1.653
## RoomNights 7.521e-02 8.839e-02 0.851
## DistributionChannelDirect 1.411e+00 1.063e+00 1.328
## DistributionChannelElectronic Distribution -1.573e+01 9.487e+03 -0.002
## DistributionChannelTravel Agent/Operator -8.461e-01 8.811e-01 -0.960
## MarketSegmentComplementary -2.898e+00 1.287e+00 -2.251
## MarketSegmentCorporate -1.343e+00 5.214e-01 -2.576
## MarketSegmentDirect -4.460e+00 1.186e+00 -3.760
## MarketSegmentGroups -4.324e+00 1.156e+00 -3.742
## MarketSegmentOther -6.733e+00 1.438e+00 -4.681
## MarketSegmentTravel Agent/Operator -2.099e+01 1.660e+03 -0.013
## SRHighFloor1 7.505e-01 8.737e-01 0.859
## SRLowFloor1 -2.732e+01 2.651e+04 -0.001
## SRAccessibleRoom1 -9.037e+00 4.627e+03 -0.002
## SRMediumFloor1 2.901e+00 2.106e+00 1.378
## SRBathtub1 -1.504e+01 5.029e+03 -0.003
## SRShower1 -1.759e+01 1.020e+04 -0.002
## SRCrib1 -1.556e+01 1.865e+03 -0.008
## SRKingSizeBed1 1.050e+00 3.834e-01 2.738
## SRTwinBed1 6.020e-01 6.066e-01 0.992
## SRNearElevator1 -9.540e+00 4.625e+03 -0.002
## SRAwayFromElevator1 -1.159e+00 2.345e+00 -0.494
## SRNoAlcoholInMiniBar1 -1.315e+01 2.979e+04 0.000
## SRQuietRoom1 1.173e+00 7.753e-01 1.514
## CheckedInCategoryGood Loyal Customers -5.570e+01 2.652e+03 -0.021
## CheckedInCategoryGreat Loyal Customers -2.753e+01 7.431e+04 0.000
## CheckedInCategoryStayed Once -6.006e+01 2.652e+03 -0.023
## CheckedInCategoryStayed Thrice -5.669e+01 2.652e+03 -0.021
## CheckedInCategoryStayed Twice -5.826e+01 2.652e+03 -0.022
## Pr(>|z|)
## (Intercept) 0.982708
## ContinentAN 0.999958
## ContinentAS 0.447805
## ContinentEU 0.181121
## ContinentNA 0.232083
## ContinentOC 0.995398
## ContinentSA 0.992333
## Age 0.012001 *
## LodgingRevenue 0.256090
## OtherRevenue 0.523094
## PersonsNights 0.098357 .
## RoomNights 0.394867
## DistributionChannelDirect 0.184191
## DistributionChannelElectronic Distribution 0.998677
## DistributionChannelTravel Agent/Operator 0.336940
## MarketSegmentComplementary 0.024356 *
## MarketSegmentCorporate 0.009982 **
## MarketSegmentDirect 0.000170 ***
## MarketSegmentGroups 0.000183 ***
## MarketSegmentOther 2.85e-06 ***
## MarketSegmentTravel Agent/Operator 0.989909
## SRHighFloor1 0.390328
## SRLowFloor1 0.999178
## SRAccessibleRoom1 0.998442
## SRMediumFloor1 0.168308
## SRBathtub1 0.997614
## SRShower1 0.998624
## SRCrib1 0.993344
## SRKingSizeBed1 0.006175 **
## SRTwinBed1 0.321006
## SRNearElevator1 0.998354
## SRAwayFromElevator1 0.621078
## SRNoAlcoholInMiniBar1 0.999648
## SRQuietRoom1 0.130132
## CheckedInCategoryGood Loyal Customers 0.983244
## CheckedInCategoryGreat Loyal Customers 0.999704
## CheckedInCategoryStayed Once 0.981933
## CheckedInCategoryStayed Thrice 0.982945
## CheckedInCategoryStayed Twice 0.982472
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 29977.23 on 21623 degrees of freedom
## Residual deviance: 367.02 on 21585 degrees of freedom
## AIC: 445.02
##
## Number of Fisher Scoring iterations: 24
# Logistic regression model 2: CheckedInCategory is left out, and only five of
# the special-request variables are kept
log_model2 <- glm(
  CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    PersonsNights + RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRNoAlcoholInMiniBar + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficients, significance and deviance of model 2
summary(log_model2)
##
## Call:
## glm(formula = CustomerOutcome ~ Continent + Age + LodgingRevenue +
## OtherRevenue + PersonsNights + RoomNights + DistributionChannel +
## MarketSegment + SRHighFloor + SRNoAlcoholInMiniBar + SRLowFloor +
## SRAccessibleRoom + SRMediumFloor, family = "binomial", data = TrainData)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 4.172e+00 5.757e-01 7.247
## ContinentAN 2.367e+01 3.561e+05 0.000
## ContinentAS 3.498e-03 3.546e-01 0.010
## ContinentEU 2.215e-02 3.136e-01 0.071
## ContinentNA 2.288e-01 3.389e-01 0.675
## ContinentOC 1.484e-01 4.447e-01 0.334
## ContinentSA 2.033e-01 3.683e-01 0.552
## Age 1.814e-03 2.463e-03 0.736
## LodgingRevenue -1.134e-02 6.779e-04 -16.731
## OtherRevenue -3.064e-03 5.476e-04 -5.595
## PersonsNights -2.337e+00 1.369e-01 -17.065
## RoomNights 1.339e+00 1.610e-01 8.317
## DistributionChannelDirect 4.073e-01 3.807e-01 1.070
## DistributionChannelElectronic Distribution -2.579e+00 4.388e-01 -5.878
## DistributionChannelTravel Agent/Operator 1.064e-01 2.612e-01 0.407
## MarketSegmentComplementary -2.694e+00 6.396e-01 -4.212
## MarketSegmentCorporate -2.426e+00 4.869e-01 -4.981
## MarketSegmentDirect -1.864e+00 6.000e-01 -3.107
## MarketSegmentGroups -2.098e+00 5.303e-01 -3.956
## MarketSegmentOther -1.308e+00 5.337e-01 -2.452
## MarketSegmentTravel Agent/Operator -1.439e+00 5.379e-01 -2.676
## SRHighFloor1 3.973e-01 1.930e-01 2.059
## SRNoAlcoholInMiniBar1 1.010e+01 2.256e+02 0.045
## SRLowFloor1 3.335e+00 2.649e+00 1.259
## SRAccessibleRoom1 4.706e+00 3.647e+01 0.129
## SRMediumFloor1 1.235e+01 1.181e+00 10.463
## Pr(>|z|)
## (Intercept) 4.27e-13 ***
## ContinentAN 0.99995
## ContinentAS 0.99213
## ContinentEU 0.94368
## ContinentNA 0.49972
## ContinentOC 0.73866
## ContinentSA 0.58094
## Age 0.46154
## LodgingRevenue < 2e-16 ***
## OtherRevenue 2.20e-08 ***
## PersonsNights < 2e-16 ***
## RoomNights < 2e-16 ***
## DistributionChannelDirect 0.28466
## DistributionChannelElectronic Distribution 4.15e-09 ***
## DistributionChannelTravel Agent/Operator 0.68376
## MarketSegmentComplementary 2.53e-05 ***
## MarketSegmentCorporate 6.31e-07 ***
## MarketSegmentDirect 0.00189 **
## MarketSegmentGroups 7.61e-05 ***
## MarketSegmentOther 0.01422 *
## MarketSegmentTravel Agent/Operator 0.00745 **
## SRHighFloor1 0.03953 *
## SRNoAlcoholInMiniBar1 0.96430
## SRLowFloor1 0.20804
## SRAccessibleRoom1 0.89734
## SRMediumFloor1 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 29977.2 on 21623 degrees of freedom
## Residual deviance: 5346.6 on 21598 degrees of freedom
## AIC: 5398.6
##
## Number of Fisher Scoring iterations: 25
# Full list of candidate regressors, kept for reference:
#CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue + BookingsCanceled + BookingsNoShowed + BookingsCheckedIn + PersonsNights + RoomNights + DistributionChannel + MarketSegment + SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
#SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom + CheckedInCategory + CanceledCategory + NoShowCategory
# Logistic regression model 3: model 2 plus the three booking categories we
# created (CheckedInCategory, CanceledCategory, NoShowCategory)
log_model3 <- glm(
  CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    PersonsNights + RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRNoAlcoholInMiniBar + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + CheckedInCategory + CanceledCategory + NoShowCategory,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficients, significance and deviance of model 3
summary(log_model3)
##
## Call:
## glm(formula = CustomerOutcome ~ Continent + Age + LodgingRevenue +
## OtherRevenue + PersonsNights + RoomNights + DistributionChannel +
## MarketSegment + SRHighFloor + SRNoAlcoholInMiniBar + SRLowFloor +
## SRAccessibleRoom + SRMediumFloor + CheckedInCategory + CanceledCategory +
## NoShowCategory, family = "binomial", data = TrainData)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -2.539e+01 8.864e+04 0.000
## ContinentAN -2.967e-03 3.570e+05 0.000
## ContinentAS -3.175e-03 2.727e+04 0.000
## ContinentEU -5.253e-03 2.439e+04 0.000
## ContinentNA -3.895e-03 2.598e+04 0.000
## ContinentOC -5.024e-03 3.333e+04 0.000
## ContinentSA -5.400e-03 2.707e+04 0.000
## Age 1.615e-05 1.750e+02 0.000
## LodgingRevenue -9.934e-06 1.060e+01 0.000
## OtherRevenue -8.861e-06 2.713e+01 0.000
## PersonsNights -1.178e-03 1.184e+03 0.000
## RoomNights 7.461e-03 2.448e+03 0.000
## DistributionChannelDirect 1.995e-02 3.553e+04 0.000
## DistributionChannelElectronic Distribution 5.523e-03 4.434e+04 0.000
## DistributionChannelTravel Agent/Operator 4.521e-03 2.556e+04 0.000
## MarketSegmentComplementary -1.019e-01 6.420e+04 0.000
## MarketSegmentCorporate -9.995e-02 4.994e+04 0.000
## MarketSegmentDirect -1.217e-01 5.888e+04 0.000
## MarketSegmentGroups -1.100e-01 5.336e+04 0.000
## MarketSegmentOther -1.091e-01 5.346e+04 0.000
## MarketSegmentTravel Agent/Operator -1.105e-01 5.367e+04 0.000
## SRHighFloor1 -1.353e-04 1.116e+04 0.000
## SRNoAlcoholInMiniBar1 -3.238e-03 1.455e+05 0.000
## SRLowFloor1 1.042e-03 6.315e+04 0.000
## SRAccessibleRoom1 -6.869e-03 1.778e+05 0.000
## SRMediumFloor1 3.267e-02 7.686e+04 0.000
## CheckedInCategoryGood Loyal Customers 5.166e+01 8.620e+04 0.001
## CheckedInCategoryGreat Loyal Customers 5.120e+01 1.939e+05 0.000
## CheckedInCategoryStayed Once 5.178e+01 8.636e+04 0.001
## CheckedInCategoryStayed Thrice 5.192e+01 9.698e+04 0.001
## CheckedInCategoryStayed Twice 5.179e+01 8.720e+04 0.001
## CanceledCategoryChecked In and Stayed -5.286e+01 4.964e+04 -0.001
## NoShowCategoryNo Show Customers 5.206e+01 6.285e+04 0.001
## Pr(>|z|)
## (Intercept) 1.000
## ContinentAN 1.000
## ContinentAS 1.000
## ContinentEU 1.000
## ContinentNA 1.000
## ContinentOC 1.000
## ContinentSA 1.000
## Age 1.000
## LodgingRevenue 1.000
## OtherRevenue 1.000
## PersonsNights 1.000
## RoomNights 1.000
## DistributionChannelDirect 1.000
## DistributionChannelElectronic Distribution 1.000
## DistributionChannelTravel Agent/Operator 1.000
## MarketSegmentComplementary 1.000
## MarketSegmentCorporate 1.000
## MarketSegmentDirect 1.000
## MarketSegmentGroups 1.000
## MarketSegmentOther 1.000
## MarketSegmentTravel Agent/Operator 1.000
## SRHighFloor1 1.000
## SRNoAlcoholInMiniBar1 1.000
## SRLowFloor1 1.000
## SRAccessibleRoom1 1.000
## SRMediumFloor1 1.000
## CheckedInCategoryGood Loyal Customers 1.000
## CheckedInCategoryGreat Loyal Customers 1.000
## CheckedInCategoryStayed Once 1.000
## CheckedInCategoryStayed Thrice 1.000
## CheckedInCategoryStayed Twice 1.000
## CanceledCategoryChecked In and Stayed 0.999
## NoShowCategoryNo Show Customers 0.999
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.9977e+04 on 21623 degrees of freedom
## Residual deviance: 1.2577e-07 on 21591 degrees of freedom
## AIC: 66
##
## Number of Fisher Scoring iterations: 25
#CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue + BookingsCanceled + BookingsNoShowed + BookingsCheckedIn + PersonsNights + RoomNights + DistributionChannel + MarketSegment + SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
#SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom + CheckedInCategory + CanceledCategory + NoShowCategory
#I think there is a lot of multicollinearity here: NoShowCategory and CanceledCategory carry essentially the same information as the dependent variable, so together they predict it perfectly.
#Let's try Model 4, with only binary variables.
# Logistic regression model 4: only the binary special-request (SR*) flags
# are used as predictors of CustomerOutcome.
log_model4 <- glm(
  CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom,
  family = "binomial",
  data = TrainData
)
# Coefficient table, deviances and AIC for the fitted model.
summary(log_model4)
##
## Call:
## glm(formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
## SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
## SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
## SRQuietRoom, family = "binomial", data = TrainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.11793 0.01971 -5.982 2.20e-09 ***
## SRHighFloor1 0.14746 0.06338 2.326 0.0200 *
## SRLowFloor1 0.51092 0.36618 1.395 0.1629
## SRAccessibleRoom1 -1.16324 1.22207 -0.952 0.3412
## SRMediumFloor1 0.25125 0.44576 0.564 0.5730
## SRBathtub1 0.16803 0.24593 0.683 0.4944
## SRShower1 0.32798 0.34936 0.939 0.3478
## SRCrib1 0.94826 0.11792 8.042 8.86e-16 ***
## SRKingSizeBed1 0.12228 0.03004 4.071 4.69e-05 ***
## SRTwinBed1 0.28205 0.04047 6.969 3.19e-12 ***
## SRNearElevator1 0.07786 0.83534 0.093 0.9257
## SRAwayFromElevator1 -0.28074 0.22383 -1.254 0.2098
## SRNoAlcoholInMiniBar1 0.34408 0.88202 0.390 0.6965
## SRQuietRoom1 0.10932 0.04824 2.266 0.0235 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 29977 on 21623 degrees of freedom
## Residual deviance: 29835 on 21610 degrees of freedom
## AIC: 29863
##
## Number of Fisher Scoring iterations: 4
#Let's do Model 5 with all binary variables plus MarketSegment and DistributionChannel.
# Logistic regression model 5: the binary SR* flags plus the MarketSegment
# and DistributionChannel factors.
log_model5 <- glm(
  CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel,
  family = "binomial",
  data = TrainData
)
# Coefficient table, deviances and AIC for the fitted model.
summary(log_model5)
##
## Call:
## glm(formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
## SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
## SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
## SRQuietRoom + MarketSegment + DistributionChannel, family = "binomial",
## data = TrainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.95391 0.31511 -3.027 0.00247
## SRHighFloor1 0.07076 0.06375 1.110 0.26702
## SRLowFloor1 0.40436 0.36768 1.100 0.27144
## SRAccessibleRoom1 -1.14097 1.21788 -0.937 0.34883
## SRMediumFloor1 0.34967 0.45598 0.767 0.44317
## SRBathtub1 0.09389 0.24649 0.381 0.70327
## SRShower1 0.21551 0.35020 0.615 0.53830
## SRCrib1 0.89045 0.11896 7.485 7.14e-14
## SRKingSizeBed1 0.06590 0.03217 2.048 0.04051
## SRTwinBed1 0.29122 0.04154 7.011 2.37e-12
## SRNearElevator1 -0.01298 0.83731 -0.016 0.98763
## SRAwayFromElevator1 -0.31053 0.22443 -1.384 0.16647
## SRNoAlcoholInMiniBar1 0.25185 0.87877 0.287 0.77443
## SRQuietRoom1 0.01978 0.04920 0.402 0.68760
## MarketSegmentComplementary 0.62935 0.40265 1.563 0.11805
## MarketSegmentCorporate 0.07846 0.33209 0.236 0.81322
## MarketSegmentDirect 0.95339 0.37508 2.542 0.01103
## MarketSegmentGroups 0.44904 0.34668 1.295 0.19522
## MarketSegmentOther 0.93940 0.34729 2.705 0.00683
## MarketSegmentTravel Agent/Operator 0.71058 0.34813 2.041 0.04124
## DistributionChannelDirect 0.15584 0.20673 0.754 0.45096
## DistributionChannelElectronic Distribution -2.00960 0.34437 -5.836 5.36e-09
## DistributionChannelTravel Agent/Operator 0.01437 0.15040 0.096 0.92387
##
## (Intercept) **
## SRHighFloor1
## SRLowFloor1
## SRAccessibleRoom1
## SRMediumFloor1
## SRBathtub1
## SRShower1
## SRCrib1 ***
## SRKingSizeBed1 *
## SRTwinBed1 ***
## SRNearElevator1
## SRAwayFromElevator1
## SRNoAlcoholInMiniBar1
## SRQuietRoom1
## MarketSegmentComplementary
## MarketSegmentCorporate
## MarketSegmentDirect *
## MarketSegmentGroups
## MarketSegmentOther **
## MarketSegmentTravel Agent/Operator *
## DistributionChannelDirect
## DistributionChannelElectronic Distribution ***
## DistributionChannelTravel Agent/Operator
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 29977 on 21623 degrees of freedom
## Residual deviance: 29542 on 21601 degrees of freedom
## AIC: 29588
##
## Number of Fisher Scoring iterations: 4
#Let's do Model 6 with the binary variables plus RoomNights, MarketSegment and DistributionChannel.
# Logistic regression model 6: model 5's predictors plus RoomNights.
# The fit emits the warning recorded below (fitted probabilities of 0 or 1).
log_model6 <- glm(
  CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel + RoomNights,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table, deviances and AIC for the fitted model.
summary(log_model6)
##
## Call:
## glm(formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
## SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
## SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
## SRQuietRoom + MarketSegment + DistributionChannel + RoomNights,
## family = "binomial", data = TrainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.18547 0.57194 10.815 < 2e-16
## SRHighFloor1 0.11092 0.14007 0.792 0.428418
## SRLowFloor1 1.96078 1.56675 1.251 0.210753
## SRAccessibleRoom1 2.17049 6.01499 0.361 0.718214
## SRMediumFloor1 3.52703 1.26593 2.786 0.005334
## SRBathtub1 0.38978 0.62563 0.623 0.533275
## SRShower1 0.29383 0.81680 0.360 0.719044
## SRCrib1 0.52051 0.24173 2.153 0.031300
## SRKingSizeBed1 0.23344 0.06694 3.487 0.000488
## SRTwinBed1 0.46290 0.09400 4.925 8.45e-07
## SRNearElevator1 -0.64861 1.54320 -0.420 0.674265
## SRAwayFromElevator1 -0.26564 0.46230 -0.575 0.565563
## SRNoAlcoholInMiniBar1 3.91712 10.14007 0.386 0.699274
## SRQuietRoom1 0.28784 0.11443 2.515 0.011887
## MarketSegmentComplementary -3.57790 0.75442 -4.743 2.11e-06
## MarketSegmentCorporate -3.53336 0.59143 -5.974 2.31e-09
## MarketSegmentDirect -3.51872 0.71966 -4.889 1.01e-06
## MarketSegmentGroups -3.59885 0.66047 -5.449 5.07e-08
## MarketSegmentOther -3.54998 0.66291 -5.355 8.55e-08
## MarketSegmentTravel Agent/Operator -3.24452 0.66664 -4.867 1.13e-06
## DistributionChannelDirect -0.33640 0.44360 -0.758 0.448259
## DistributionChannelElectronic Distribution -2.00583 0.53678 -3.737 0.000186
## DistributionChannelTravel Agent/Operator -0.22707 0.34427 -0.660 0.509540
## RoomNights -3.09936 0.04734 -65.473 < 2e-16
##
## (Intercept) ***
## SRHighFloor1
## SRLowFloor1
## SRAccessibleRoom1
## SRMediumFloor1 **
## SRBathtub1
## SRShower1
## SRCrib1 *
## SRKingSizeBed1 ***
## SRTwinBed1 ***
## SRNearElevator1
## SRAwayFromElevator1
## SRNoAlcoholInMiniBar1
## SRQuietRoom1 *
## MarketSegmentComplementary ***
## MarketSegmentCorporate ***
## MarketSegmentDirect ***
## MarketSegmentGroups ***
## MarketSegmentOther ***
## MarketSegmentTravel Agent/Operator ***
## DistributionChannelDirect
## DistributionChannelElectronic Distribution ***
## DistributionChannelTravel Agent/Operator
## RoomNights ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 29977.2 on 21623 degrees of freedom
## Residual deviance: 6017.1 on 21600 degrees of freedom
## AIC: 6065.1
##
## Number of Fisher Scoring iterations: 9
#Let's do Model 7 with the binary variables and only RoomNights and PersonsNights added.
# Logistic regression model 7: the binary SR* flags plus PersonsNights and
# RoomNights. The fit emits the two warnings recorded below (no convergence,
# and fitted probabilities of 0 or 1).
log_model7 <- glm(
  CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + PersonsNights + RoomNights,
  family = "binomial",
  data = TrainData
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table, deviances and AIC for the fitted model.
summary(log_model7)
##
## Call:
## glm(formula = CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
## SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
## SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
## SRQuietRoom + PersonsNights + RoomNights, family = "binomial",
## data = TrainData)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.566835 0.047261 54.312 < 2e-16 ***
## SRHighFloor1 0.281929 0.182497 1.545 0.12238
## SRLowFloor1 5.575126 1.911342 2.917 0.00354 **
## SRAccessibleRoom1 5.104512 20.729519 0.246 0.80549
## SRMediumFloor1 10.264616 1.666861 6.158 7.36e-10 ***
## SRBathtub1 0.426507 0.831778 0.513 0.60812
## SRShower1 0.503090 1.060393 0.474 0.63519
## SRCrib1 1.145811 0.402574 2.846 0.00442 **
## SRKingSizeBed1 0.411852 0.073145 5.631 1.80e-08 ***
## SRTwinBed1 1.069203 0.139723 7.652 1.97e-14 ***
## SRNearElevator1 -1.305765 1.342244 -0.973 0.33064
## SRAwayFromElevator1 0.125108 0.734191 0.170 0.86469
## SRNoAlcoholInMiniBar1 5.317801 67.825129 0.078 0.93751
## SRQuietRoom1 0.306440 0.135593 2.260 0.02382 *
## PersonsNights -2.328372 0.132391 -17.587 < 2e-16 ***
## RoomNights -0.000517 0.143480 -0.004 0.99713
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 29977.2 on 21623 degrees of freedom
## Residual deviance: 5489.4 on 21608 degrees of freedom
## AIC: 5521.4
##
## Number of Fisher Scoring iterations: 25
Now that we have 7 different logistic models, let's test them with our test data.
library(pROC)
Model 1
# Model 1 on the test set: predicted probabilities, ROC/AUC, confusion matrix.
# With a factor response, a binomial glm treats the first factor level as
# "failure", so type = "response" is the probability of the SECOND level of
# CustomerOutcome.
predictions1 <- predict(logistic_model1, newdata = TestData, type = "response")
# Evaluate model performance
roc_curve1 <- roc(TestData$CustomerOutcome, predictions1)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score1 <- auc(roc_curve1)
# Confusion matrix. Predictions are labelled with the outcome's own factor
# levels, in the same order, so rows and columns share labels and the diagonal
# really holds the correct classifications. Fixing the levels also keeps the
# table 2x2 even if only one class is ever predicted.
outcome_levels <- levels(TestData$CustomerOutcome)
predicted_class1 <- factor(
  ifelse(predictions1 > 0.5, outcome_levels[2], outcome_levels[1]),
  levels = outcome_levels
)
conf_matrix1 <- table(Actual = TestData$CustomerOutcome, Predicted = predicted_class1)
print("Confusion Matrix Model 1:")
## [1] "Confusion Matrix Model 1:"
print(conf_matrix1)
## Predicted
## Actual 1 0
## 1 7189 19
## 0 28 7180
Model 2
# Model 2 on the test set: predicted probabilities, ROC/AUC, confusion matrix.
# With a factor response, a binomial glm treats the first factor level as
# "failure", so type = "response" is the probability of the SECOND level of
# CustomerOutcome.
predictions2 <- predict(log_model2, newdata = TestData, type = "response")
# Evaluate model performance
roc_curve2 <- roc(TestData$CustomerOutcome, predictions2)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score2 <- auc(roc_curve2)
# Confusion matrix. Predictions are labelled with the outcome's own factor
# levels, in the same order, so rows and columns share labels and the diagonal
# really holds the correct classifications. Fixing the levels also keeps the
# table 2x2 even if only one class is ever predicted.
outcome_levels <- levels(TestData$CustomerOutcome)
predicted_class2 <- factor(
  ifelse(predictions2 > 0.5, outcome_levels[2], outcome_levels[1]),
  levels = outcome_levels
)
conf_matrix2 <- table(Actual = TestData$CustomerOutcome, Predicted = predicted_class2)
print("Confusion Matrix Model 2:")
## [1] "Confusion Matrix Model 2:"
print(conf_matrix2)
## Predicted
## Actual 1 0
## 1 6904 304
## 0 51 7157
Model 3
# Model 3 on the test set: predicted probabilities, ROC/AUC, confusion matrix.
# With a factor response, a binomial glm treats the first factor level as
# "failure", so type = "response" is the probability of the SECOND level of
# CustomerOutcome.
predictions3 <- predict(log_model3, newdata = TestData, type = "response")
# Evaluate model performance
roc_curve3 <- roc(TestData$CustomerOutcome, predictions3)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score3 <- auc(roc_curve3)
# Confusion matrix. Predictions are labelled with the outcome's own factor
# levels, in the same order, so rows and columns share labels and the diagonal
# really holds the correct classifications. Fixing the levels also keeps the
# table 2x2 even if only one class is ever predicted.
outcome_levels <- levels(TestData$CustomerOutcome)
predicted_class3 <- factor(
  ifelse(predictions3 > 0.5, outcome_levels[2], outcome_levels[1]),
  levels = outcome_levels
)
conf_matrix3 <- table(Actual = TestData$CustomerOutcome, Predicted = predicted_class3)
print("Confusion Matrix Model 3:")
## [1] "Confusion Matrix Model 3:"
print(conf_matrix3)
## Predicted
## Actual 1 0
## 1 7208 0
## 0 0 7208
Model 4
# Model 4 on the test set: predicted probabilities, ROC/AUC, confusion matrix.
# With a factor response, a binomial glm treats the first factor level as
# "failure", so type = "response" is the probability of the SECOND level of
# CustomerOutcome.
predictions4 <- predict(log_model4, newdata = TestData, type = "response")
# Evaluate model performance
roc_curve4 <- roc(TestData$CustomerOutcome, predictions4)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score4 <- auc(roc_curve4)
# Confusion matrix. Predictions are labelled with the outcome's own factor
# levels, in the same order, so rows and columns share labels and the diagonal
# really holds the correct classifications. Fixing the levels also keeps the
# table 2x2 even if only one class is ever predicted.
outcome_levels <- levels(TestData$CustomerOutcome)
predicted_class4 <- factor(
  ifelse(predictions4 > 0.5, outcome_levels[2], outcome_levels[1]),
  levels = outcome_levels
)
conf_matrix4 <- table(Actual = TestData$CustomerOutcome, Predicted = predicted_class4)
print("Confusion Matrix Model 4:")
## [1] "Confusion Matrix Model 4:"
print(conf_matrix4)
## Predicted
## Actual 1 0
## 1 3582 3626
## 0 3178 4030
Model 5
# Model 5 on the test set: predicted probabilities, ROC/AUC, confusion matrix.
# With a factor response, a binomial glm treats the first factor level as
# "failure", so type = "response" is the probability of the SECOND level of
# CustomerOutcome.
predictions5 <- predict(log_model5, newdata = TestData, type = "response")
# Evaluate model performance
roc_curve5 <- roc(TestData$CustomerOutcome, predictions5)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score5 <- auc(roc_curve5)
# Confusion matrix. Predictions are labelled with the outcome's own factor
# levels, in the same order, so rows and columns share labels and the diagonal
# really holds the correct classifications. Fixing the levels also keeps the
# table 2x2 even if only one class is ever predicted.
outcome_levels <- levels(TestData$CustomerOutcome)
predicted_class5 <- factor(
  ifelse(predictions5 > 0.5, outcome_levels[2], outcome_levels[1]),
  levels = outcome_levels
)
conf_matrix5 <- table(Actual = TestData$CustomerOutcome, Predicted = predicted_class5)
print("Confusion Matrix Model 5:")
## [1] "Confusion Matrix Model 5:"
print(conf_matrix5)
## Predicted
## Actual 1 0
## 1 3281 3927
## 0 2841 4367
Model 6
# Model 6 on the test set: predicted probabilities, ROC/AUC, confusion matrix.
# With a factor response, a binomial glm treats the first factor level as
# "failure", so type = "response" is the probability of the SECOND level of
# CustomerOutcome.
predictions6 <- predict(log_model6, newdata = TestData, type = "response")
# Evaluate model performance
roc_curve6 <- roc(TestData$CustomerOutcome, predictions6)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score6 <- auc(roc_curve6)
# Confusion matrix. Predictions are labelled with the outcome's own factor
# levels, in the same order, so rows and columns share labels and the diagonal
# really holds the correct classifications. Fixing the levels also keeps the
# table 2x2 even if only one class is ever predicted.
outcome_levels <- levels(TestData$CustomerOutcome)
predicted_class6 <- factor(
  ifelse(predictions6 > 0.5, outcome_levels[2], outcome_levels[1]),
  levels = outcome_levels
)
conf_matrix6 <- table(Actual = TestData$CustomerOutcome, Predicted = predicted_class6)
print("Confusion Matrix Model 6:")
## [1] "Confusion Matrix Model 6:"
print(conf_matrix6)
## Predicted
## Actual 1 0
## 1 7166 42
## 0 52 7156
Model 7
# Model 7 on the test set: predicted probabilities, ROC/AUC, confusion matrix.
# With a factor response, a binomial glm treats the first factor level as
# "failure", so type = "response" is the probability of the SECOND level of
# CustomerOutcome.
predictions7 <- predict(log_model7, newdata = TestData, type = "response")
# Evaluate model performance
roc_curve7 <- roc(TestData$CustomerOutcome, predictions7)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score7 <- auc(roc_curve7)
# Confusion matrix. Predictions are labelled with the outcome's own factor
# levels, in the same order, so rows and columns share labels and the diagonal
# really holds the correct classifications. Fixing the levels also keeps the
# table 2x2 even if only one class is ever predicted.
outcome_levels <- levels(TestData$CustomerOutcome)
predicted_class7 <- factor(
  ifelse(predictions7 > 0.5, outcome_levels[2], outcome_levels[1]),
  levels = outcome_levels
)
conf_matrix7 <- table(Actual = TestData$CustomerOutcome, Predicted = predicted_class7)
print("Confusion Matrix Model 7:")
## [1] "Confusion Matrix Model 7:"
print(conf_matrix7)
## Predicted
## Actual 1 0
## 1 6766 442
## 0 50 7158
#Performance metrics (accuracy, precision, recall, F1-score and AUC) for each model. The code for the selected models is below:
# Model 1
# conf_matrix1 is table(Actual, Predicted): rows are actual classes, columns
# are predicted classes. The positive class is the one in row/column 2.
accuracy_model1 <- sum(diag(conf_matrix1)) / sum(conf_matrix1)
# Precision = true positives / everything predicted positive (column 2 total)
precision_model1 <- conf_matrix1[2, 2] / sum(conf_matrix1[, 2])
# Recall = true positives / everything actually positive (row 2 total)
recall_model1 <- conf_matrix1[2, 2] / sum(conf_matrix1[2, ])
f1_score_model1 <- 2 * precision_model1 * recall_model1 / (precision_model1 + recall_model1)
roc_curve_model1 <- roc(TestData$CustomerOutcome, predictions1)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model1 <- auc(roc_curve_model1)
# Display metrics
cat("Model 1 Metrics:\n")
## Model 1 Metrics:
cat("Accuracy:", accuracy_model1, "\n")
## Accuracy: 0.9967397
cat("Precision:", precision_model1, "\n")
## Precision: 0.9973607
cat("Recall:", recall_model1, "\n")
## Recall: 0.9961154
cat("F1-Score:", f1_score_model1, "\n")
## F1-Score: 0.9967377
cat("AUC:", auc_score_model1, "\n\n")
## AUC: 0.9994244
# Model 2
# conf_matrix2 is table(Actual, Predicted): rows are actual classes, columns
# are predicted classes. The positive class is the one in row/column 2.
accuracy_model2 <- sum(diag(conf_matrix2)) / sum(conf_matrix2)
# Precision = true positives / everything predicted positive (column 2 total)
precision_model2 <- conf_matrix2[2, 2] / sum(conf_matrix2[, 2])
# Recall = true positives / everything actually positive (row 2 total)
recall_model2 <- conf_matrix2[2, 2] / sum(conf_matrix2[2, ])
f1_score_model2 <- 2 * precision_model2 * recall_model2 / (precision_model2 + recall_model2)
roc_curve_model2 <- roc(TestData$CustomerOutcome, predictions2)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model2 <- auc(roc_curve_model2)
# Display metrics
cat("Model 2 Metrics:\n")
## Model 2 Metrics:
cat("Accuracy:", accuracy_model2, "\n")
## Accuracy: 0.9753746
cat("Precision:", precision_model2, "\n")
## Precision: 0.9592548
cat("Recall:", recall_model2, "\n")
## Recall: 0.9929245
cat("F1-Score:", f1_score_model2, "\n")
## F1-Score: 0.9757993
cat("AUC:", auc_score_model2, "\n\n")
## AUC: 0.9952066
# Model 4
# conf_matrix4 is table(Actual, Predicted): rows are actual classes, columns
# are predicted classes. The positive class is the one in row/column 2.
accuracy_model4 <- sum(diag(conf_matrix4)) / sum(conf_matrix4)
# Precision = true positives / everything predicted positive (column 2 total)
precision_model4 <- conf_matrix4[2, 2] / sum(conf_matrix4[, 2])
# Recall = true positives / everything actually positive (row 2 total)
recall_model4 <- conf_matrix4[2, 2] / sum(conf_matrix4[2, ])
f1_score_model4 <- 2 * precision_model4 * recall_model4 / (precision_model4 + recall_model4)
roc_curve_model4 <- roc(TestData$CustomerOutcome, predictions4)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model4 <- auc(roc_curve_model4)
# Display metrics
cat("Model 4 Metrics:\n")
## Model 4 Metrics:
cat("Accuracy:", accuracy_model4, "\n")
## Accuracy: 0.5280244
cat("Precision:", precision_model4, "\n")
## Precision: 0.5263845
cat("Recall:", recall_model4, "\n")
## Recall: 0.559101
cat("F1-Score:", f1_score_model4, "\n")
## F1-Score: 0.5422497
cat("AUC:", auc_score_model4, "\n\n")
## AUC: 0.5383072
# Model 5
# conf_matrix5 is table(Actual, Predicted): rows are actual classes, columns
# are predicted classes. The positive class is the one in row/column 2.
accuracy_model5 <- sum(diag(conf_matrix5)) / sum(conf_matrix5)
# Precision = true positives / everything predicted positive (column 2 total)
precision_model5 <- conf_matrix5[2, 2] / sum(conf_matrix5[, 2])
# Recall = true positives / everything actually positive (row 2 total)
recall_model5 <- conf_matrix5[2, 2] / sum(conf_matrix5[2, ])
f1_score_model5 <- 2 * precision_model5 * recall_model5 / (precision_model5 + recall_model5)
roc_curve_model5 <- roc(TestData$CustomerOutcome, predictions5)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model5 <- auc(roc_curve_model5)
# Display metrics
cat("Model 5 Metrics:\n")
## Model 5 Metrics:
cat("Accuracy:", accuracy_model5, "\n")
## Accuracy: 0.5305216
cat("Precision:", precision_model5, "\n")
## Precision: 0.5265252
cat("Recall:", recall_model5, "\n")
## Recall: 0.6058546
cat("F1-Score:", f1_score_model5, "\n")
## F1-Score: 0.5634112
cat("AUC:", auc_score_model5, "\n\n")
## AUC: 0.5643688
# Model 6
# conf_matrix6 is table(Actual, Predicted): rows are actual classes, columns
# are predicted classes. The positive class is the one in row/column 2.
accuracy_model6 <- sum(diag(conf_matrix6)) / sum(conf_matrix6)
# Precision = true positives / everything predicted positive (column 2 total)
precision_model6 <- conf_matrix6[2, 2] / sum(conf_matrix6[, 2])
# Recall = true positives / everything actually positive (row 2 total)
recall_model6 <- conf_matrix6[2, 2] / sum(conf_matrix6[2, ])
f1_score_model6 <- 2 * precision_model6 * recall_model6 / (precision_model6 + recall_model6)
roc_curve_model6 <- roc(TestData$CustomerOutcome, predictions6)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model6 <- auc(roc_curve_model6)
# Display metrics
cat("Model 6 Metrics:\n")
## Model 6 Metrics:
cat("Accuracy:", accuracy_model6, "\n")
## Accuracy: 0.9934795
cat("Precision:", precision_model6, "\n")
## Precision: 0.994165
cat("Recall:", recall_model6, "\n")
## Recall: 0.9927858
cat("F1-Score:", f1_score_model6, "\n")
## F1-Score: 0.9934749
cat("AUC:", auc_score_model6, "\n\n")
## AUC: 0.9944694
# Model 7
# conf_matrix7 is table(Actual, Predicted): rows are actual classes, columns
# are predicted classes. The positive class is the one in row/column 2.
accuracy_model7 <- sum(diag(conf_matrix7)) / sum(conf_matrix7)
# Precision = true positives / everything predicted positive (column 2 total)
precision_model7 <- conf_matrix7[2, 2] / sum(conf_matrix7[, 2])
# Recall = true positives / everything actually positive (row 2 total)
recall_model7 <- conf_matrix7[2, 2] / sum(conf_matrix7[2, ])
f1_score_model7 <- 2 * precision_model7 * recall_model7 / (precision_model7 + recall_model7)
roc_curve_model7 <- roc(TestData$CustomerOutcome, predictions7)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
auc_score_model7 <- auc(roc_curve_model7)
# Display metrics
cat("Model 7 Metrics:\n")
## Model 7 Metrics:
cat("Accuracy:", accuracy_model7, "\n")
## Accuracy: 0.9658713
cat("Precision:", precision_model7, "\n")
## Precision: 0.9418421
cat("Recall:", recall_model7, "\n")
## Recall: 0.9930633
cat("F1-Score:", f1_score_model7, "\n")
## F1-Score: 0.9667747
cat("AUC:", auc_score_model7, "\n\n")
## AUC: 0.9952011
#Generating the decile lift curve, bar chart, and goodness-of-fit statistics for each model:
library(caret)
library(pROC)
library(gains)
#CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue + PersonsNights + RoomNights + DistributionChannel + MarketSegment + SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar + SRQuietRoom + CheckedInCategory
# Model 1
# Logical flag: TRUE where the CustomerOutcome level, read as a number, is 0
# (the original note assumes "0" = did not show up or canceled).
TestData$CustomerOutcomeNumeric <-
  as.numeric(levels(TestData$CustomerOutcome))[TestData$CustomerOutcome] == 0
# Attach Model 1's predicted probabilities and bin them into ten fixed
# 0.1-wide probability intervals (not equal-count quantile groups).
TestData$predicted_prob <- predictions1
TestData$decile <- cut(
  TestData$predicted_prob,
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)
# Count flagged rows per bin
decile_analysis1 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)
# Bar chart of the per-bin counts
ggplot(decile_analysis1, aes(x = as.factor(decile), y = CustomerOutcomeNumeric)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Decile Analysis for Model 1",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )
# Model 2
# Logical flag: TRUE where the CustomerOutcome level, read as a number, is 0
# (the original note assumes "0" = did not show up or canceled).
TestData$CustomerOutcomeNumeric <-
  as.numeric(levels(TestData$CustomerOutcome))[TestData$CustomerOutcome] == 0
# Attach Model 2's predicted probabilities and bin them into ten fixed
# 0.1-wide probability intervals (not equal-count quantile groups).
TestData$predicted_prob <- predictions2
TestData$decile <- cut(
  TestData$predicted_prob,
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)
# Count flagged rows per bin
decile_analysis2 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)
# Bar chart of the per-bin counts
ggplot(decile_analysis2, aes(x = as.factor(decile), y = CustomerOutcomeNumeric)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Decile Analysis for Model 2",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )
# Model 3
# Logical flag: TRUE where the CustomerOutcome level, read as a number, is 0
# (the original note assumes "0" = did not show up or canceled).
TestData$CustomerOutcomeNumeric <-
  as.numeric(levels(TestData$CustomerOutcome))[TestData$CustomerOutcome] == 0
# Attach Model 3's predicted probabilities and bin them into ten fixed
# 0.1-wide probability intervals (not equal-count quantile groups).
TestData$predicted_prob <- predictions3
TestData$decile <- cut(
  TestData$predicted_prob,
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)
# Count flagged rows per bin
decile_analysis3 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)
# Bar chart of the per-bin counts
ggplot(decile_analysis3, aes(x = as.factor(decile), y = CustomerOutcomeNumeric)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Decile Analysis for Model 3",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )
# Model 4
# Logical flag: TRUE where the CustomerOutcome level, read as a number, is 0
# (the original note assumes "0" = did not show up or canceled).
TestData$CustomerOutcomeNumeric <-
  as.numeric(levels(TestData$CustomerOutcome))[TestData$CustomerOutcome] == 0
# Attach Model 4's predicted probabilities and bin them into ten fixed
# 0.1-wide probability intervals (not equal-count quantile groups).
TestData$predicted_prob <- predictions4
TestData$decile <- cut(
  TestData$predicted_prob,
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)
# Count flagged rows per bin
decile_analysis4 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)
# Bar chart of the per-bin counts
ggplot(decile_analysis4, aes(x = as.factor(decile), y = CustomerOutcomeNumeric)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Decile Analysis for Model 4",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )
# Model 5
# Logical flag: TRUE where the CustomerOutcome level, read as a number, is 0
# (the original note assumes "0" = did not show up or canceled).
TestData$CustomerOutcomeNumeric <-
  as.numeric(levels(TestData$CustomerOutcome))[TestData$CustomerOutcome] == 0
# Attach Model 5's predicted probabilities and bin them into ten fixed
# 0.1-wide probability intervals (not equal-count quantile groups).
TestData$predicted_prob <- predictions5
TestData$decile <- cut(
  TestData$predicted_prob,
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)
# Count flagged rows per bin
decile_analysis5 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)
# Bar chart of the per-bin counts
ggplot(decile_analysis5, aes(x = as.factor(decile), y = CustomerOutcomeNumeric)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Decile Analysis for Model 5",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )
# Model 6
# Logical flag: TRUE where the CustomerOutcome level, read as a number, is 0
# (the original note assumes "0" = did not show up or canceled).
TestData$CustomerOutcomeNumeric <-
  as.numeric(levels(TestData$CustomerOutcome))[TestData$CustomerOutcome] == 0
# Attach Model 6's predicted probabilities and bin them into ten fixed
# 0.1-wide probability intervals (not equal-count quantile groups).
TestData$predicted_prob <- predictions6
TestData$decile <- cut(
  TestData$predicted_prob,
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)
# Count flagged rows per bin
decile_analysis6 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)
# Bar chart of the per-bin counts
ggplot(decile_analysis6, aes(x = as.factor(decile), y = CustomerOutcomeNumeric)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Decile Analysis for Model 6",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )
# Model 7
# Logical flag: TRUE where the CustomerOutcome level, read as a number, is 0
# (the original note assumes "0" = did not show up or canceled).
TestData$CustomerOutcomeNumeric <-
  as.numeric(levels(TestData$CustomerOutcome))[TestData$CustomerOutcome] == 0
# Attach Model 7's predicted probabilities and bin them into ten fixed
# 0.1-wide probability intervals (not equal-count quantile groups).
TestData$predicted_prob <- predictions7
TestData$decile <- cut(
  TestData$predicted_prob,
  breaks = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1),
  include.lowest = TRUE
)
# Count flagged rows per bin
decile_analysis7 <- aggregate(
  CustomerOutcomeNumeric ~ decile,
  data = TestData,
  FUN = sum
)
# Bar chart of the per-bin counts
ggplot(decile_analysis7, aes(x = as.factor(decile), y = CustomerOutcomeNumeric)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Decile Analysis for Model 7",
    x = "Decile",
    y = "Count of No Shows or Cancellations"
  )
Non-Parametric Analysis
#Using Random Forest with the predictors of log_model5
# Load the necessary library
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:psych':
##
## outlier
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Random forest (500 trees) on the same predictors as log_model5.
# NOTE(review): no set.seed() call is visible near this fit, so results may
# differ between runs unless a seed is set earlier in the file.
rf_model5 <- randomForest(
  CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel,
  data = TrainData,
  ntree = 500
)
# Predicted classes for the test rows
rf_predictions5 <- predict(rf_model5, newdata = TestData)
# Cross-tabulate actual outcome (rows) against predicted class (columns);
# accuracy is the diagonal's share of all test rows.
rf_conf_matrix5 <- table(TestData$CustomerOutcome, rf_predictions5)
rf_accuracy5 <- sum(diag(rf_conf_matrix5)) / sum(rf_conf_matrix5)
# Show the confusion matrix and the accuracy
print(rf_conf_matrix5)
## rf_predictions5
## 1 0
## 1 1908 5300
## 0 1082 6126
print(paste("Accuracy:", rf_accuracy5))
## [1] "Accuracy: 0.557297447280799"
# Random forest using the predictors of log_model6
# Load the necessary library
library(randomForest)
# Random forest (500 trees): log_model5's predictors plus RoomNights.
# NOTE(review): no set.seed() call is visible near this fit, so results may
# differ between runs unless a seed is set earlier in the file.
rf_model6 <- randomForest(
  CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel + RoomNights,
  data = TrainData,
  ntree = 500
)
# Predicted classes for the test rows
rf_predictions6 <- predict(rf_model6, newdata = TestData)
# Cross-tabulate actual outcome (rows) against predicted class (columns);
# accuracy is the diagonal's share of all test rows.
rf_conf_matrix6 <- table(TestData$CustomerOutcome, rf_predictions6)
rf_accuracy6 <- sum(diag(rf_conf_matrix6)) / sum(rf_conf_matrix6)
# Show the confusion matrix and the accuracy
print(rf_conf_matrix6)
## rf_predictions6
## 1 0
## 1 7205 3
## 0 47 7161
print(paste("Accuracy:", rf_accuracy6))
## [1] "Accuracy: 0.996531631520533"
# Random forest using the predictors of logistic_model1
# Load the necessary library
library(randomForest)
# Random forest (500 trees) on the full predictor set listed below.
# NOTE(review): no set.seed() call is visible near this fit, so results may
# differ between runs unless a seed is set earlier in the file.
rf_model1 <- randomForest(
  CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    PersonsNights + RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + CheckedInCategory,
  data = TrainData,
  ntree = 500
)
# Predicted classes for the test rows
rf_predictions1 <- predict(rf_model1, newdata = TestData)
# Cross-tabulate actual outcome (rows) against predicted class (columns);
# accuracy is the diagonal's share of all test rows.
rf_conf_matrix1 <- table(TestData$CustomerOutcome, rf_predictions1)
rf_accuracy1 <- sum(diag(rf_conf_matrix1)) / sum(rf_conf_matrix1)
# Show the confusion matrix and the accuracy
print(rf_conf_matrix1)
## rf_predictions1
## 1 0
## 1 7196 12
## 0 30 7178
print(paste("Accuracy:", rf_accuracy1))
## [1] "Accuracy: 0.997086570477247"
# Random forest using the predictors of log_model4
# Load the necessary library
library(randomForest)
# Random forest (500 trees) on the binary special-request (SR*) flags only.
# NOTE(review): no set.seed() call is visible near this fit, so results may
# differ between runs unless a seed is set earlier in the file.
rf_model4 <- randomForest(
  CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom,
  data = TrainData,
  ntree = 500
)
# Predicted classes for the test rows
rf_predictions4 <- predict(rf_model4, newdata = TestData)
# Cross-tabulate actual outcome (rows) against predicted class (columns);
# accuracy is the diagonal's share of all test rows.
rf_conf_matrix4 <- table(TestData$CustomerOutcome, rf_predictions4)
rf_accuracy4 <- sum(diag(rf_conf_matrix4)) / sum(rf_conf_matrix4)
# Show the confusion matrix and the accuracy
print(rf_conf_matrix4)
## rf_predictions4
## 1 0
## 1 3459 3749
## 0 3067 4141
print(paste("Accuracy:", rf_accuracy4))
## [1] "Accuracy: 0.527192008879023"
#Model 5
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.2
##
## Attaching package: 'e1071'
## The following objects are masked from 'package:PerformanceAnalytics':
##
## kurtosis, skewness
# Naive Bayes classifier on the same predictors as log_model5
nb_model5 <- naiveBayes(
  CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel,
  data = TrainData
)
# Predicted classes for the test rows
nb_predictions5 <- predict(nb_model5, newdata = TestData)
# Actual outcome (rows) against predicted class (columns); accuracy is the
# diagonal's share of all test rows.
nb_conf_matrix5 <- table(TestData$CustomerOutcome, nb_predictions5)
nb_accuracy5 <- sum(diag(nb_conf_matrix5)) / sum(nb_conf_matrix5)
print(nb_conf_matrix5)
## nb_predictions5
## 1 0
## 1 3370 3838
## 0 2891 4317
print(paste("Naive Bayes Model 5 Accuracy:", nb_accuracy5))
## [1] "Naive Bayes Model 5 Accuracy: 0.533226970033296"
# Model 6
# Naive Bayes classifier on log_model5's predictors plus RoomNights
nb_model6 <- naiveBayes(
  CustomerOutcome ~ SRHighFloor + SRLowFloor + SRAccessibleRoom +
    SRMediumFloor + SRBathtub + SRShower + SRCrib + SRKingSizeBed +
    SRTwinBed + SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + MarketSegment + DistributionChannel + RoomNights,
  data = TrainData
)
# Predicted classes for the test rows
nb_predictions6 <- predict(nb_model6, newdata = TestData)
# Actual outcome (rows) against predicted class (columns); accuracy is the
# diagonal's share of all test rows.
nb_conf_matrix6 <- table(TestData$CustomerOutcome, nb_predictions6)
nb_accuracy6 <- sum(diag(nb_conf_matrix6)) / sum(nb_conf_matrix6)
print(nb_conf_matrix6)
## nb_predictions6
## 1 0
## 1 6106 1102
## 0 97 7111
print(paste("Naive Bayes Model 6 Accuracy:", nb_accuracy6))
## [1] "Naive Bayes Model 6 Accuracy: 0.916828523862375"
# Model 1
# Naive Bayes classifier on the full predictor set listed below
nb_model1 <- naiveBayes(
  CustomerOutcome ~ Continent + Age + LodgingRevenue + OtherRevenue +
    PersonsNights + RoomNights + DistributionChannel + MarketSegment +
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom + CheckedInCategory,
  data = TrainData
)
# Predicted classes for the test rows
nb_predictions1 <- predict(nb_model1, newdata = TestData)
# Actual outcome (rows) against predicted class (columns); accuracy is the
# diagonal's share of all test rows.
nb_conf_matrix1 <- table(TestData$CustomerOutcome, nb_predictions1)
nb_accuracy1 <- sum(diag(nb_conf_matrix1)) / sum(nb_conf_matrix1)
print(nb_conf_matrix1)
## nb_predictions1
## 1 0
## 1 7204 4
## 0 52 7156
print(paste("Naive Bayes Model 1 Accuracy:", nb_accuracy1))
## [1] "Naive Bayes Model 1 Accuracy: 0.996115427302997"
# Naive Bayes Model 4 ----
# Predictors: only the thirteen SR* special-request columns.
# Fitted on TrainData.
nb_model4 <- naiveBayes(
  CustomerOutcome ~
    SRHighFloor + SRLowFloor + SRAccessibleRoom + SRMediumFloor +
    SRBathtub + SRShower + SRCrib + SRKingSizeBed + SRTwinBed +
    SRNearElevator + SRAwayFromElevator + SRNoAlcoholInMiniBar +
    SRQuietRoom,
  data = TrainData
)
# Classify TestData with the fitted model.
nb_predictions4 <- predict(object = nb_model4, newdata = TestData)
# Confusion matrix: rows = observed CustomerOutcome, columns = predicted class.
nb_conf_matrix4 <- table(TestData[["CustomerOutcome"]], nb_predictions4)
# Accuracy = diagonal (matching row/column label) over all tabulated rows.
nb_accuracy4 <- sum(diag(nb_conf_matrix4)) / sum(nb_conf_matrix4)
print(nb_conf_matrix4)
## nb_predictions4
## 1 0
## 1 5382 1826
## 0 5000 2208
print(paste("Naive Bayes Model 4 Accuracy:", nb_accuracy4))
## [1] "Naive Bayes Model 4 Accuracy: 0.52649833518313"
# ROC curves and AUC ----
# Model 1
# pROC provides roc() and auc().
library(pROC)
# Build the ROC curve for predictions1 against the observed outcome in
# TestData. `levels` and `direction` are pinned explicitly instead of being
# auto-detected: pROC's automatic direction is chosen per curve from the
# group medians, which biases AUC upward and lets different models be scored
# under different orientations. The values used here are the ones pROC
# selected automatically in the recorded run (control = 1, case = 0,
# controls < cases), so the reported AUC is unchanged.
roc_curve_model1 <- roc(
  TestData$CustomerOutcome,
  predictions1,
  levels = c("1", "0"),
  direction = "<"
)
auc_model1 <- auc(roc_curve_model1)
plot(roc_curve_model1, main = "ROC Curve for Model 1")
print(paste("AUC for Model 1:", auc_model1))
## [1] "AUC for Model 1: 0.999424427907825"
# The same steps are repeated for the other models, swapping predictions1
# for that model's own predictions object.
# Model 2
# ROC curve for predictions2 against the observed outcome in TestData.
# `levels` and `direction` are pinned (control = "1", case = "0",
# controls < cases) rather than auto-detected, so every model is scored
# under the same orientation and AUC is not biased upward by pROC's
# automatic direction choice. These match what pROC selected in the
# recorded run, so the reported AUC is unchanged.
roc_curve_model2 <- roc(
  TestData$CustomerOutcome,
  predictions2,
  levels = c("1", "0"),
  direction = "<"
)
auc_model2 <- auc(roc_curve_model2)
plot(roc_curve_model2, main = "ROC Curve for Model 2")
print(paste("AUC for Model 2:", auc_model2))
## [1] "AUC for Model 2: 0.995206597352676"
# Model 4
# ROC curve for predictions4 against the observed outcome in TestData.
# `levels` and `direction` are pinned (control = "1", case = "0",
# controls < cases) rather than auto-detected. This matters most for a
# near-chance model like this one: pROC's automatic direction would flip the
# comparison whenever that yields the better-looking curve, hiding an AUC
# below 0.5. These values match what pROC selected in the recorded run, so
# the reported AUC is unchanged.
roc_curve_model4 <- roc(
  TestData$CustomerOutcome,
  predictions4,
  levels = c("1", "0"),
  direction = "<"
)
auc_model4 <- auc(roc_curve_model4)
plot(roc_curve_model4, main = "ROC Curve for Model 4")
print(paste("AUC for Model 4:", auc_model4))
## [1] "AUC for Model 4: 0.538307196360315"
# Model 5
# ROC curve for predictions5 against the observed outcome in TestData.
# `levels` and `direction` are pinned (control = "1", case = "0",
# controls < cases) rather than auto-detected, so a near-chance model cannot
# have its orientation silently flipped to report a more favourable AUC.
# These values match what pROC selected in the recorded run, so the reported
# AUC is unchanged.
roc_curve_model5 <- roc(
  TestData$CustomerOutcome,
  predictions5,
  levels = c("1", "0"),
  direction = "<"
)
auc_model5 <- auc(roc_curve_model5)
plot(roc_curve_model5, main = "ROC Curve for Model 5")
print(paste("AUC for Model 5:", auc_model5))
## [1] "AUC for Model 5: 0.56436877117976"
# Model 6
# ROC curve for predictions6 against the observed outcome in TestData.
# `levels` and `direction` are pinned (control = "1", case = "0",
# controls < cases) rather than auto-detected, so every model is scored
# under the same orientation and AUC is not biased upward by pROC's
# automatic direction choice. These match what pROC selected in the
# recorded run, so the reported AUC is unchanged.
roc_curve_model6 <- roc(
  TestData$CustomerOutcome,
  predictions6,
  levels = c("1", "0"),
  direction = "<"
)
auc_model6 <- auc(roc_curve_model6)
plot(roc_curve_model6, main = "ROC Curve for Model 6")
print(paste("AUC for Model 6:", auc_model6))
## [1] "AUC for Model 6: 0.994469443943159"
# Model 7
# ROC curve for predictions7 against the observed outcome in TestData.
# `levels` and `direction` are pinned (control = "1", case = "0",
# controls < cases) rather than auto-detected, so every model is scored
# under the same orientation and AUC is not biased upward by pROC's
# automatic direction choice. These match what pROC selected in the
# recorded run, so the reported AUC is unchanged.
roc_curve_model7 <- roc(
  TestData$CustomerOutcome,
  predictions7,
  levels = c("1", "0"),
  direction = "<"
)
auc_model7 <- auc(roc_curve_model7)
plot(roc_curve_model7, main = "ROC Curve for Model 7")
print(paste("AUC for Model 7:", auc_model7))
## [1] "AUC for Model 7: 0.995201131111565"
# Export HotelLisbon_data to CSV in the current working directory,
# omitting the row-name column.
write.csv(
  x = HotelLisbon_data,
  file = "New_HotelLisbon_data.csv",
  row.names = FALSE
)